
    Vh,                     |    d Z dZddl ddlmZmZ ddlmZ  G d de      Zdd	Z	 G d
 de
      Z G d de      Zy)z&
Corpus reader for the SemCor Corpus.
z
epytext en    )*)XMLCorpusReaderXMLCorpusView)Treec                   l    e Zd ZdZddZddZddZddfdZddZdd	Z	ddfd
Z
d Zd Zed        Zy)SemcorCorpusReadera  
    Corpus reader for the SemCor Corpus.
    For access to the complete XML data structure, use the ``xml()``
    method.  For access to simple word lists and tagged word lists, use
    ``words()``, ``sents()``, ``tagged_words()``, and ``tagged_sents()``.
    c                 N    t        j                  | ||       || _        || _        y N)r   __init___lazy_wordnet)selfrootfileidswordnetlazys        I/home/dcms/DCMS/lib/python3.12/site-packages/nltk/corpus/reader/semcor.pyr   zSemcorCorpusReader.__init__   s"      tW5
    Nc                 ,    | j                  |dddd      S )zr
        :return: the given file(s) as a list of words and punctuation symbols.
        :rtype: list(str)
        wordF_itemsr   r   s     r   wordszSemcorCorpusReader.words    s    
 {{7FE5%@@r   c                 ,    | j                  |dddd      S )z
        :return: the given file(s) as a list of chunks,
            each of which is a list of words and punctuation symbols
            that form a unit.
        :rtype: list(list(str))
        chunkFr   r   s     r   chunkszSemcorCorpusReader.chunks'   s     {{7GUE5AAr   posc                 8    | j                  |dd|dk7  |dk7        S )ac  
        :return: the given file(s) as a list of tagged chunks, represented
            in tree form.
        :rtype: list(Tree)

        :param tag: `'pos'` (part of speech), `'sem'` (semantic), or `'both'`
            to indicate the kind of tags to include.  Semantic tags consist of
            WordNet lemma IDs, plus an `'NE'` node if the chunk is a named entity
            without a specific entry in WordNet.  (Named entities of type 'other'
            have no lemma.  Other chunks not in WordNet have no semantic tag.
            Punctuation tokens have `None` for their part of speech tag.)
        r   Fsemr   r   r   r   tags      r   tagged_chunksz SemcorCorpusReader.tagged_chunks0   s#     {{7GUC5L#,OOr   c                 ,    | j                  |dddd      S )z
        :return: the given file(s) as a list of sentences, each encoded
            as a list of word strings.
        :rtype: list(list(str))
        r   TFr   r   s     r   sentszSemcorCorpusReader.sents?   s     {{7FD%??r   c                 ,    | j                  |dddd      S )z
        :return: the given file(s) as a list of sentences, each encoded
            as a list of chunks.
        :rtype: list(list(list(str)))
        r   TFr   r   s     r   chunk_sentszSemcorCorpusReader.chunk_sentsG   s     {{7GT5%@@r   c                 8    | j                  |dd|dk7  |dk7        S )a  
        :return: the given file(s) as a list of sentences. Each sentence
            is represented as a list of tagged chunks (in tree form).
        :rtype: list(list(Tree))

        :param tag: `'pos'` (part of speech), `'sem'` (semantic), or `'both'`
            to indicate the kind of tags to include.  Semantic tags consist of
            WordNet lemma IDs, plus an `'NE'` node if the chunk is a named entity
            without a specific entry in WordNet.  (Named entities of type 'other'
            have no lemma.  Other chunks not in WordNet have no semantic tag.
            Punctuation tokens have `None` for their part of speech tag.)
        r   Tr    r   r   r!   s      r   tagged_sentszSemcorCorpusReader.tagged_sentsO   s#     {{7GT3%<NNr   c                      |dk(  r|s fd}n j                   rt        n j                  }t         j	                  |      D cg c]  } |||||| j
                         c}      S c c}w )Nr   c                  d    t         j                  rt        |        S j                  |        S r
   )LazyConcatenationr   SemcorWordView_words)argsr   s    r   <lambda>z+SemcorCorpusReader._items.<locals>.<lambda>b   s0    />4::F 37;;F r   )r   r-   r.   concatabspathsr   )r   r   unitbracket_sentpos_tagsem_tag_fileids   `       r   r   zSemcorCorpusReader._items^   sl    6>,A #'**$++A #mmG4 &$gwN
 	
s   A*c           	         |dv sJ g }t         j                  |      j                         }|j                  d      D ]  }g }	t	        |      D ]M  }
t
        j                  |
|||| j                        }|dk(  r|	j                  |       =|	j                  |       O |r)|j                  t        |j                  d   |	             |j                  |	        d|vsJ |S )a]  
        Helper used to implement the view methods -- returns a list of
        tokens, (segmented) words, chunks, or sentences. The tokens
        and chunks may optionally be tagged (with POS and sense
        information).

        :param fileid: The name of the underlying file.
        :param unit: One of `'token'`, `'word'`, or `'chunk'`.
        :param bracket_sent: If true, include sentence bracketing.
        :param pos_tag: Whether to include part-of-speech tags.
        :param sem_tag: Whether to include semantic tags, namely WordNet lemma
            and OOV named entity status.
        )tokenr   r   z.//sr   snumN)ElementTreeparsegetrootfindall_all_xmlwords_inr   _wordr   extendappendSemcorSentenceattrib)r   r8   r3   r4   r5   r6   resultxmldocxmlsentsentxmlworditms               r   r.   zSemcorCorpusReader._wordsn   s     1111""6*224~~f- 	$GD+G4 %(..T7GT]] 6>KK$KK$% nW^^F-CTJKd#	$  6!!!r   c                    | j                   }|sd}| j                  d|      }| j                  d      }|,|dz   |z   }dt        |j                  d      d         dz
     }	nd x}}	| j                  d	|      }
| j                  d
      }d| j	                         v }| j                  d      }|dk(  r |s|s|}|S |f|r|fndz   |r||	||fndz   }|S |j                  d      }|dk(  r|S |	 |j                  |      }|rt        ||      gn|}|r'|r%|t        t        d|      g      S t        d|      S |r|t        |      S |r|d   S |S # t        $ r3 	 d||	t        |      fz  }n# t        $ r |dz   |	z   dz   |z   }Y nw xY wY w xY w)N lemmalexsn%)nvars:r      rdfwnsnpnr   r:    r7   r   z
%s.%s.%02d.NE)	textgetintsplitkeyslemma_from_key	Exception
ValueErrorr   )rJ   r3   r5   r6   r   tknrN   rO   	sense_keywnposredefsensenumisOOVEntityr   rK   wwsensebottoms                     r   rA   zSemcorCorpusReader._word   s   llCGS)G$e+I-EKK$Q'(1,E !%$I3
 ;;v&glln,kk
 7?7 J	 F!(vb2@Guh<RQ 
 J3Bv~	' ' 6 6y A$ -4$sB-{+#EDv,>+?@@#D&11!5v..!!9$!M; % 	$0 % % #H4 %E
  *  %e 3c 9H D "s6   ,E 	FE,+F,FFFFF)Tr
   )__name__
__module____qualname____doc__r   r   r   r#   r%   r'   r)   r   r.   staticmethodrA   r[   r   r   r   r      s_     
AB %)u P@A $(e O
 #J J" J"r   r   Nc                 t    |g }| D ].  }|j                   dv r|j                  |       #t        ||       0 |S )Nwfpunc)r"   rC   r@   )eltrF   childs      r   r@   r@      sD    ~ ,99&MM% UF+	,
 Mr   c                       e Zd ZdZd Zy)rD   z
    A list of words, augmented by an attribute ``num`` used to record
    the sentence identifier (the ``n`` attribute from the XML).
    c                 >    || _         t        j                  | |       y r
   )numlistr   )r   r|   itemss      r   r   zSemcorSentence.__init__   s    dE"r   N)ro   rp   rq   rr   r   r[   r   r   rD   rD      s    
#r   rD   c                   (    e Zd ZdZd Zd Zd Zd Zy)r-   zN
    A stream backed corpus view specialized for use with the BNC corpus.
    c                     |rd}nd}|| _         || _        || _        || _        || _        t        j                  | ||       y)a{  
        :param fileid: The name of the underlying file.
        :param unit: One of `'token'`, `'word'`, or `'chunk'`.
        :param bracket_sent: If true, include sentence bracketing.
        :param pos_tag: Whether to include part-of-speech tags.
        :param sem_tag: Whether to include semantic tags, namely WordNet lemma
            and OOV named entity status.
        z.*/sz.*/s/(punc|wf)N)_unit_sent_pos_tag_sem_tagr   r   r   )r   r8   r3   r4   r5   r6   r   tagspecs           r   r   zSemcorWordView.__init__   sF     G&G
!
tVW5r   c                 ^    | j                   r| j                  |      S | j                  |      S r
   )r   handle_senthandle_word)r   rx   contexts      r   
handle_eltzSemcorWordView.handle_elt  s+    ::##C((##C((r   c                     t         j                  || j                  | j                  | j                  | j
                        S r
   )r   rA   r   r   r   r   )r   rx   s     r   r   zSemcorWordView.handle_word  s0    !''T]]DMM4==
 	
r   c                    g }|D ]k  }|j                   dv rD| j                  |      }| j                  dk(  r|j                  |       C|j	                  |       Ut        d|j                   z         t        |j                  d   |      S )Nru   r   zUnexpected element %sr;   )r"   r   r   rB   rC   re   rD   rE   )r   rx   rI   ry   rK   s        r   r   zSemcorWordView.handle_sent  s     	FEyyN*&&u-::'KK$KK$ !8599!DEE	F cjj0$77r   N)ro   rp   rq   rr   r   r   r   r   r[   r   r   r-   r-      s    6,)

8r   r-   r
   )rr   __docformat__nltk.corpus.reader.apinltk.corpus.reader.xmldocsr   r   	nltk.treer   r   r@   r}   rD   r-   r[   r   r   <module>r      sF     $ E K" K"\#T #18] 18r   