
    Vh                     <    d Z ddlZddl ddl  G d dee      Zy)a  
CorpusReader for the Pros and Cons dataset.

- Pros and Cons dataset information -

Contact: Bing Liu, liub@cs.uic.edu
        https://www.cs.uic.edu/~liub

Distributed with permission.

Related papers:

- Murthy Ganapathibhotla and Bing Liu. "Mining Opinions in Comparative Sentences".
    Proceedings of the 22nd International Conference on Computational Linguistics
    (Coling-2008), Manchester, 18-22 August, 2008.

- Bing Liu, Minqing Hu and Junsheng Cheng. "Opinion Observer: Analyzing and Comparing
    Opinions on the Web". Proceedings of the 14th international World Wide Web
    conference (WWW-2005), May 10-14, 2005, in Chiba, Japan.
    N)*c                   F    e Zd ZdZeZ e       dfdZd	dZd	dZ	d Z
d Zy)
ProsConsCorpusReadera  
    Reader for the Pros and Cons sentence dataset.

        >>> from nltk.corpus import pros_cons
        >>> pros_cons.sents(categories='Cons') # doctest: +NORMALIZE_WHITESPACE
        [['East', 'batteries', '!', 'On', '-', 'off', 'switch', 'too', 'easy',
        'to', 'maneuver', '.'], ['Eats', '...', 'no', ',', 'GULPS', 'batteries'],
        ...]
        >>> pros_cons.words('IntegratedPros.txt')
        ['Easy', 'to', 'use', ',', 'economical', '!', ...]
    utf8c                 n    t         j                  | |||       t        j                  | |       || _        y)a  
        :param root: The root directory for the corpus.
        :param fileids: a list or regexp specifying the fileids in the corpus.
        :param word_tokenizer: a tokenizer for breaking sentences or paragraphs
            into words. Default: `WhitespaceTokenizer`
        :param encoding: the encoding that should be used to read the corpus.
        :param kwargs: additional parameters passed to CategorizedCorpusReader.
        N)CorpusReader__init__CategorizedCorpusReader_word_tokenizer)selfrootfileidsword_tokenizerencodingkwargss         L/home/dcms/DCMS/lib/python3.12/site-packages/nltk/corpus/reader/pros_cons.pyr	   zProsConsCorpusReader.__init__1   s0    " 	dD'8<((v6-    Nc                    | j                  ||      }|| j                  }nt        |t              r|g}t	        | j                  |dd      D cg c]$  \  }}}| j                  || j                  |      & c}}}      S c c}}}w )a  
        Return all sentences in the corpus or in the specified files/categories.

        :param fileids: a list or regexp specifying the ids of the files whose
            sentences have to be returned.
        :param categories: a list specifying the categories whose sentences
            have to be returned.
        :return: the given file(s) as a list of sentences. Each sentence is
            tokenized using the specified word_tokenizer.
        :rtype: list(list(str))
        Tr   )_resolve_fileids
isinstancestrconcatabspaths
CorpusView_read_sent_blockr   r   
categoriespathencfileids         r   sentszProsConsCorpusReader.sentsF        --4?mmG%iG ,0==$+M 'T3 d&;&;cJ
 	
   )Bc                    | j                  ||      }|| j                  }nt        |t              r|g}t	        | j                  |dd      D cg c]$  \  }}}| j                  || j                  |      & c}}}      S c c}}}w )a  
        Return all words and punctuation symbols in the corpus or in the specified
        files/categories.

        :param fileids: a list or regexp specifying the ids of the files whose
            words have to be returned.
        :param categories: a list specifying the categories whose words have
            to be returned.
        :return: the given file(s) as a list of words and punctuation symbols.
        :rtype: list(str)
        Tr   )r   r   r   r   r   r   r   _read_word_blockr   s         r   wordszProsConsCorpusReader.words^   r$   r%   c                    g }t        d      D ]u  }|j                         }|st        j                  d|      }|s/|j	                  | j
                  j                  |j                  d      j                                      w |S )N   z+^(?!\n)\s*<(Pros|Cons)>(.*)</(?:Pros|Cons)>   )	rangereadlinerematchappendr   tokenizegroupstrip)r   streamr#   ilinesents         r   r   z%ProsConsCorpusReader._read_sent_blockv   sw    r 	SA??$D88JDQDT11::4::a=;N;N;PQR	S r   c                 X    g }| j                  |      D ]  }|j                  |        |S )N)r   extend)r   r4   r(   r7   s       r   r'   z%ProsConsCorpusReader._read_word_block   s2    ))&1 	DLL	r   )NN)__name__
__module____qualname____doc__StreamBackedCorpusViewr   WordPunctTokenizerr	   r#   r(   r   r'    r   r   r   r   "   s2    
 (J *+.*
0
0	r   r   )r=   r.   nltk.corpus.reader.apinltk.tokenizer
   r   r   r@   r   r   <module>rC      s&   ( 
 $ c2L cr   