
    Vh$                        d Z ddlZddl ddl ddlmZ ddlmZ ej                  d      Z
ej                  d      Zej                  d      Zej                  d	      Z G d
 de      Z G d dee      Z G d de      Zy)zO
Corpus reader for corpora that consist of parenthesis-delineated parse trees.
    N)*)map_tag)Treez\((\d+) ([^\s()]+) ([^\s()]+)\)z\(([^\s()]+) ([^\s()]+)\)z\([^\s()]+ ([^\s()]+)\)z
\s*\(\s*\(c                   @    e Zd ZdZ	 	 	 	 d	dZd Zd Zd Zd
dZd Z	y)BracketParseCorpusReaderz
    Reader for corpora that consist of parenthesis-delineated parse trees,
    like those found in the "combined" section of the Penn Treebank,
    e.g. "(S (NP (DT the) (JJ little) (NN dog)) (VP (VBD barked)))".

    Nc                 ^    t         j                  | |||       || _        || _        || _        y)a  
        :param root: The root directory for this corpus.
        :param fileids: A list or regexp specifying the fileids in this corpus.
        :param comment_char: The character which can appear at the start of
            a line to indicate that the rest of the line is a comment.
        :param detect_blocks: The method that is used to find blocks
            in the corpus; can be 'unindented_paren' (every unindented
            parenthesis starts a new parse) or 'sexpr' (brackets are
            matched).
        :param tagset: The name of the tagset used by this corpus, to be used
            for normalizing or converting the POS tags returned by the
            ``tagged_...()`` methods.
        N)SyntaxCorpusReader__init___comment_char_detect_blocks_tagset)selfrootfileidscomment_chardetect_blocksencodingtagsets          P/home/dcms/DCMS/lib/python3.12/site-packages/nltk/corpus/reader/bracket_parse.pyr
   z!BracketParseCorpusReader.__init__"   s/    , 	##D$B)+    c           	      v   | j                   dk(  rt        || j                        S | j                   dk(  rt        |      S | j                   dk(  r_t	        |d      }| j                  rD|D cg c]9  }t
        j                  dt
        j                  | j                        z  d|      ; }}|S J d	       c c}w )
Nsexpr)r   	blanklineunindented_parenz^\()start_rez	(?m)^%s.* zbad block type)r   read_sexpr_blockr   read_blankline_blockread_regexp_blockresubescape)r   streamtokstoks       r   _read_blockz$BracketParseCorpusReader._read_block=   s    ')#F9K9KLL  K/'//  $66$Vf=D!!  $ FF;43E3E)FFCP  K&&&1s   ->B6c                 b    t         j                  dd|      }t         j                  dd|      }|S )Nz\((.)\)z(\1 \1)z"\(([^\s()]+) ([^\s()]+) [^\s()]+\)(\1 \2)r    r!   r   ts     r   
_normalizez#BracketParseCorpusReader._normalizeO   s,    FF:z1-FF8*aHr   c           	      n   	 t        j                  | j                  |            }|j                         dk(  rt	        |      dk(  r|d   S |S # t
        $ r}t        j                  j                  d       |j                  dk(  rkt        dd      D ]\  }	 t        | j                  |d|z  z               }t        j                  j                  d|z         |c cY d }~S # t
        $ r Y Zw xY w t        j                  j                  d	       t        d
| j                  |            cY d }~S d }~ww xY w)Nr      r   z(Bad tree detected; trying to recover...
)zmismatched parens   )z(  Recovered by adding %d close paren(s)
z'  Recovered by returning a flat parse.
S)r   
fromstringr,   labellen
ValueErrorsysstderrwriteargsrange_tag)r   r+   treeenvs         r   _parsezBracketParseCorpusReader._parseV   s   	+??4??1#56Dzz|r!c$i1nAw 	+JJHIvv//q! A S1W!=>

((JQN  !%  JJGHTYYq\**!	+sO   A	A A 	D4=D/AC D/D4 	C,)D/+C,,=D/)D4/D4c           
         t         j                  | j                  |            D cg c]	  \  }}||f }}}|r:|| j                  k7  r+|D cg c]  \  }}|t	        | j                  ||      f  }}}|S c c}}w c c}}w N)TAGWORDfindallr,   r   r   )r   r+   r   pwtagged_sents         r   r;   zBracketParseCorpusReader._tagq   s    ,3OODOOA<N,OP&1a1vPPf,DO:@1aGDLL&!45K   Qs   A7#A=c                 J    t         j                  | j                  |            S rB   )WORDrD   r,   r*   s     r   _wordzBracketParseCorpusReader._wordy   s    ||DOOA.//r   )Nr   utf8NrB   )
__name__
__module____qualname____doc__r
   r&   r,   r@   r;   rJ    r   r   r   r      s3     (6'$+60r   r   c                   f     e Zd ZdZd Zd	 fd	Zd	 fd	Zd	 fd	Zd
 fd	Zd
 fd	Z	d
 fd	Z
 xZS )#CategorizedBracketParseCorpusReaderz
    A reader for parsed corpora whose documents are
    divided into categories based on their file identifiers.
    @author: Nathan Schneider <nschneid@cs.cmu.edu>
    c                 b    t         j                  | |       t        j                  | g|i | y)at  
        Initialize the corpus reader.  Categorization arguments
        (C{cat_pattern}, C{cat_map}, and C{cat_file}) are passed to
        the L{CategorizedCorpusReader constructor
        <CategorizedCorpusReader.__init__>}.  The remaining arguments
        are passed to the L{BracketParseCorpusReader constructor
        <BracketParseCorpusReader.__init__>}.
        N)CategorizedCorpusReaderr
   r   )r   r9   kwargss      r   r
   z,CategorizedBracketParseCorpusReader.__init__   s,     	 ((v6 ))$@@@r   c                 D    t         |   | j                  ||      |      S rB   )supertagged_words_resolver   r   
categoriesr   	__class__s       r   rX   z0CategorizedBracketParseCorpusReader.tagged_words        w#DMM':$FOOr   c                 D    t         |   | j                  ||      |      S rB   )rW   tagged_sentsrY   rZ   s       r   r_   z0CategorizedBracketParseCorpusReader.tagged_sents   r]   r   c                 D    t         |   | j                  ||      |      S rB   )rW   tagged_parasrY   rZ   s       r   ra   z0CategorizedBracketParseCorpusReader.tagged_paras   r]   r   c                 B    t         |   | j                  ||            S rB   )rW   parsed_wordsrY   r   r   r[   r\   s      r   rc   z0CategorizedBracketParseCorpusReader.parsed_words       w#DMM':$FGGr   c                 B    t         |   | j                  ||            S rB   )rW   parsed_sentsrY   rd   s      r   rg   z0CategorizedBracketParseCorpusReader.parsed_sents   re   r   c                 B    t         |   | j                  ||            S rB   )rW   parsed_parasrY   rd   s      r   ri   z0CategorizedBracketParseCorpusReader.parsed_paras   re   r   )NNN)NN)rL   rM   rN   rO   r
   rX   r_   ra   rc   rg   ri   __classcell__)r\   s   @r   rR   rR   }   s8    
APPPHHH Hr   rR   c                   .    e Zd ZdZddZddZd	dZd Zy)
AlpinoCorpusReadera  
    Reader for the Alpino Dutch Treebank.
    This corpus has a lexical breakdown structure embedded, as read by `_parse`
    Unfortunately this puts punctuation and some other words out of the sentence
    order in the xml element tree. This is no good for `tag_` and `word_`
    `_tag` and `_word` will be overridden to use a non-default new parameter 'ordered'
    to the overridden _normalize function. The _parse function can then remain
    untouched.
    Nc                 :    t         j                  | |dd||       y )Nzalpino\.xmlr   )r   r   r   )r   r
   )r   r   r   r   s       r   r
   zAlpinoCorpusReader.__init__   s'     ))% 	* 	
r   c                 2   |dd dk7  ryt         j                  dd|      }|rt         j                  dd|      }nt         j                  d	d
|      }t         j                  dd|      }t         j                  dd|      }t         j                  dd|      }|S )a  Normalize the xml sentence element in t.
        The sentence elements <alpino_ds>, although embedded in a few overall
        xml elements, are separated by blank lines. That's how the reader can
        deliver them one at a time.
        Each sentence has a few category subnodes that are of no use to us.
        The remaining word nodes may or may not appear in the proper order.
        Each word node has attributes, among which:
        - begin : the position of the word in the sentence
        - pos   : Part of Speech: the Tag
        - word  : the actual word
        The return value is a string with all xml elementes replaced by
        clauses: either a cat clause with nested clauses, or a word clause.
        The order of the bracket clauses closely follows the xml.
        If ordered == True, the word clauses include an order sequence number.
        If ordered == False, the word clauses only have pos and word parts.
        N
   z
<alpino_dsr   z  <node .*? cat="(\w+)".*>z(\1z>  <node. *?begin="(\d+)".*? pos="(\w+)".*? word="([^"]+)".*?/>z
(\1 \2 \3)z-  <node .*?pos="(\w+)".*? word="([^"]+)".*?/>r(   z	  </node>r0   z<sentence>.*</sentence>z</?alpino_ds.*>r)   )r   r+   ordereds      r   r,   zAlpinoCorpusReader._normalize   s    " Sb6\!FF0&!<QA GUVWAFF<q)FF-sA6FF%sA.r   c                    t         j                  | j                  |d            D cg c]  \  }}}t        |      ||f }}}}|j	                          |r?|| j
                  k7  r0|D cg c]  \  }}}|t        | j
                  ||      f! }}}}|S |D cg c]
  \  }}}||f }}}}|S c c}}}w c c}}}w c c}}}w )NT)rp   )
SORTTAGWRDrD   r,   intsortr   r   )r   r+   r   orE   rF   rG   s          r   r;   zAlpinoCorpusReader._tag   s     (//40PQ
 
Aq VQN
 
 	f,GR :C1aGDLL&!45K 
  4???iq!QAq6?K?
 @s   B4/$B;Cc                 Z    | j                  |      }|D cg c]  \  }}|	 c}}S c c}}w )z(Return a correctly ordered list if words)r;   )r   r+   rG   rF   rE   s        r   rJ   zAlpinoCorpusReader._word   s'    iil +,fq!,,,s   ')z
ISO-8859-1N)FrB   )rL   rM   rN   rO   r
   r,   r;   rJ   rP   r   r   rl   rl      s    
 D-r   rl   )rO   r6   nltk.corpus.reader.apinltk.corpus.reader.utilnltk.tagr   	nltk.treer   r    compilerr   rC   rI   EMPTY_BRACKETSr	   r   rT   rR   rl   rP   r   r   <module>r}      s     $ %   ZZ:;

**1
2	zz,-M*`01 `0F%H5%HPH-1 H-r   