Ë
    ÎÆVh›  ã                   ó<   — d Z ddlZddl­ ddl­  G d„ dee«      Zy)aˆ  
CorpusReader for the Pros and Cons dataset.

- Pros and Cons dataset information -

Contact: Bing Liu, liub@cs.uic.edu
        https://www.cs.uic.edu/~liub

Distributed with permission.

Related papers:

- Murthy Ganapathibhotla and Bing Liu. "Mining Opinions in Comparative Sentences".
    Proceedings of the 22nd International Conference on Computational Linguistics
    (Coling-2008), Manchester, 18-22 August, 2008.

- Bing Liu, Minqing Hu and Junsheng Cheng. "Opinion Observer: Analyzing and Comparing
    Opinions on the Web". Proceedings of the 14th international World Wide Web
    conference (WWW-2005), May 10-14, 2005, in Chiba, Japan.
é    N)Ú*c                   óF   — e Zd ZdZeZ e«       dfd„Zd	d„Zd	d„Z	d„ Z
d„ Zy)
ÚProsConsCorpusReaderaÒ  
    Reader for the Pros and Cons sentence dataset.

        >>> from nltk.corpus import pros_cons
        >>> pros_cons.sents(categories='Cons') # doctest: +NORMALIZE_WHITESPACE
        [['East', 'batteries', '!', 'On', '-', 'off', 'switch', 'too', 'easy',
        'to', 'maneuver', '.'], ['Eats', '...', 'no', ',', 'GULPS', 'batteries'],
        ...]
        >>> pros_cons.words('IntegratedPros.txt')
        ['Easy', 'to', 'use', ',', 'economical', '!', ...]
    Úutf8c                 ón   — t         j                  | |||«       t        j                  | |«       || _        y)aµ  
        :param root: The root directory for the corpus.
        :param fileids: a list or regexp specifying the fileids in the corpus.
        :param word_tokenizer: a tokenizer for breaking sentences or paragraphs
            into words. Default: `WhitespaceTokenizer`
        :param encoding: the encoding that should be used to read the corpus.
        :param kwargs: additional parameters passed to CategorizedCorpusReader.
        N)ÚCorpusReaderÚ__init__ÚCategorizedCorpusReaderÚ_word_tokenizer)ÚselfÚrootÚfileidsÚword_tokenizerÚencodingÚkwargss         úL/home/dcms/DCMS/lib/python3.12/site-packages/nltk/corpus/reader/pros_cons.pyr	   zProsConsCorpusReader.__init__1   s0   € ô" 	×Ñ˜d D¨'°8Ô<Ü×(Ñ(¨¨vÔ6Ø-ˆÕó    Nc                 ó  — | j                  ||«      }|€| j                  }nt        |t        «      r|g}t	        | j                  |dd«      D cg c]$  \  }}}| j                  || j                  |¬«      ‘Œ& c}}}«      S c c}}}w )aè  
        Return all sentences in the corpus or in the specified files/categories.

        :param fileids: a list or regexp specifying the ids of the files whose
            sentences have to be returned.
        :param categories: a list specifying the categories whose sentences
            have to be returned.
        :return: the given file(s) as a list of sentences. Each sentence is
            tokenized using the specified word_tokenizer.
        :rtype: list(list(str))
        T©r   )Ú_resolveÚ_fileidsÚ
isinstanceÚstrÚconcatÚabspathsÚ
CorpusViewÚ_read_sent_block©r   r   Ú
categoriesÚpathÚencÚfileids         r   ÚsentszProsConsCorpusReader.sentsF   óŠ   € ð —-‘- ¨Ó4ˆØˆ?Ø—m‘m‰GÜ˜¤Ô%ØiˆGÜð ,0¯=©=¸À$ÈÓ+M÷ð á'T˜3 ð —‘  d×&;Ñ&;ÀcÕJôó
ð 	
ùôó   Á)Bc                 ó  — | j                  ||«      }|€| j                  }nt        |t        «      r|g}t	        | j                  |dd«      D cg c]$  \  }}}| j                  || j                  |¬«      ‘Œ& c}}}«      S c c}}}w )a¿  
        Return all words and punctuation symbols in the corpus or in the specified
        files/categories.

        :param fileids: a list or regexp specifying the ids of the files whose
            words have to be returned.
        :param categories: a list specifying the categories whose words have
            to be returned.
        :return: the given file(s) as a list of words and punctuation symbols.
        :rtype: list(str)
        Tr   )r   r   r   r   r   r   r   Ú_read_word_blockr   s         r   ÚwordszProsConsCorpusReader.words^   r$   r%   c                 ó  — g }t        d«      D ]u  }|j                  «       }|sŒt        j                  d|«      }|sŒ/|j	                  | j
                  j                  |j                  d«      j                  «       «      «       Œw |S )Né   z+^(?!\n)\s*<(Pros|Cons)>(.*)</(?:Pros|Cons)>é   )	ÚrangeÚreadlineÚreÚmatchÚappendr   ÚtokenizeÚgroupÚstrip)r   Ústreamr#   ÚiÚlineÚsents         r   r   z%ProsConsCorpusReader._read_sent_blockv   sw   € ØˆÜr“ò 	SˆAØ—?‘?Ó$ˆDÙØÜ—8‘8ÐJÈDÓQˆDÚØ—‘˜T×1Ñ1×:Ñ:¸4¿:¹:Àa»=×;NÑ;NÓ;PÓQÕRð	Sð ˆr   c                 óX   — g }| j                  |«      D ]  }|j                  |«       Œ |S )N)r   Úextend)r   r4   r(   r7   s       r   r'   z%ProsConsCorpusReader._read_word_block   s2   € ØˆØ×)Ñ)¨&Ó1ò 	ˆDØL‰L˜Õð	àˆr   )NN)Ú__name__Ú
__module__Ú__qualname__Ú__doc__ÚStreamBackedCorpusViewr   ÚWordPunctTokenizerr	   r#   r(   r   r'   © r   r   r   r   "   s2   „ ñ
ð (€Jñ *Ó+Øó.ó*
ó0
ò0	ór   r   )r=   r.   Únltk.corpus.reader.apiÚnltk.tokenizer
   r   r   r@   r   r   ú<module>rC      s&   ðñó( 
ä $Ü ôcÐ2°Lõ cr   