
    Vh-                     J   d Z ddlZddl ddl  ej                  d      Z ej                  d      Z ej                  d      Z ej                  d      Z ej                  d      Z	 ej                  d	      Z
 ej                  d
      Z G d d      Z G d de      Zy)a  
CorpusReader for the Comparative Sentence Dataset.

- Comparative Sentence Dataset information -

Annotated by: Nitin Jindal and Bing Liu, 2006.
              Department of Computer Sicence
              University of Illinois at Chicago

Contact: Nitin Jindal, njindal@cs.uic.edu
         Bing Liu, liub@cs.uic.edu (https://www.cs.uic.edu/~liub)

Distributed with permission.

Related papers:

- Nitin Jindal and Bing Liu. "Identifying Comparative Sentences in Text Documents".
   Proceedings of the ACM SIGIR International Conference on Information Retrieval
   (SIGIR-06), 2006.

- Nitin Jindal and Bing Liu. "Mining Comprative Sentences and Relations".
   Proceedings of Twenty First National Conference on Artificial Intelligence
   (AAAI-2006), 2006.

- Murthy Ganapathibhotla and Bing Liu. "Mining Opinions in Comparative Sentences".
    Proceedings of the 22nd International Conference on Computational Linguistics
    (Coling-2008), Manchester, 18-22 August, 2008.
    N)*z^\*+$z<cs-[1234]>z</cs-[1234]>z
<cs-[123]>z<cs-4>z(\d)_((?:[\.\w\s/-](?!\d_))+)z\(([^\(]*)\)$c                   *    e Zd ZdZ	 	 	 	 	 	 ddZd Zy)
ComparisonzN
    A Comparison represents a comparative sentence and its constituents.
    Nc                 X    || _         || _        || _        || _        || _        || _        y)a]  
        :param text: a string (optionally tokenized) containing a comparison.
        :param comp_type: an integer defining the type of comparison expressed.
            Values can be: 1 (Non-equal gradable), 2 (Equative), 3 (Superlative),
            4 (Non-gradable).
        :param entity_1: the first entity considered in the comparison relation.
        :param entity_2: the second entity considered in the comparison relation.
        :param feature: the feature considered in the comparison relation.
        :param keyword: the word or phrase which is used for that comparative relation.
        N)text	comp_typeentity_1entity_2featurekeyword)selfr   r   r	   r
   r   r   s          T/home/dcms/DCMS/lib/python3.12/site-packages/nltk/corpus/reader/comparative_sents.py__init__zComparison.__init__8   s.    & 	"      c                     dj                  | j                  | j                  | j                  | j                  | j
                  | j                        S )Nz]Comparison(text="{}", comp_type={}, entity_1="{}", entity_2="{}", feature="{}", keyword="{}"))formatr   r   r	   r
   r   r   )r   s    r   __repr__zComparison.__repr__R   s@    *
&IINNMMMMLLLL

	
r   )NNNNNN)__name__
__module____qualname____doc__r   r    r   r   r   r   3   s%     4
r   r   c                   j    e Zd ZdZeZ e       ddfdZddZddZ	d Z
ddZdd	Zd
 Zd Zd Zd Zy) ComparativeSentencesCorpusReadera  
    Reader for the Comparative Sentence Dataset by Jindal and Liu (2006).

        >>> from nltk.corpus import comparative_sentences
        >>> comparison = comparative_sentences.comparisons()[0]
        >>> comparison.text # doctest: +NORMALIZE_WHITESPACE
        ['its', 'fast-forward', 'and', 'rewind', 'work', 'much', 'more', 'smoothly',
        'and', 'consistently', 'than', 'those', 'of', 'other', 'models', 'i', "'ve",
        'had', '.']
        >>> comparison.entity_2
        'models'
        >>> (comparison.feature, comparison.keyword)
        ('rewind', 'more')
        >>> len(comparative_sentences.comparisons())
        853
    Nutf8c                 ^    t         j                  | |||       || _        || _        d| _        y)a  
        :param root: The root directory for this corpus.
        :param fileids: a list or regexp specifying the fileids in this corpus.
        :param word_tokenizer: tokenizer for breaking sentences or paragraphs
            into words. Default: `WhitespaceTokenizer`
        :param sent_tokenizer: tokenizer for breaking paragraphs into sentences.
        :param encoding: the encoding that should be used to read the corpus.
        z
README.txtN)CorpusReaderr   _word_tokenizer_sent_tokenizer_readme)r   rootfileidsword_tokenizersent_tokenizerencodings         r   r   z)ComparativeSentencesCorpusReader.__init__t   s/    " 	dD'8<--#r   c                     || j                   }nt        |t              r|g}t        | j	                  |dd      D cg c]$  \  }}}| j                  || j                  |      & c}}}      S c c}}}w )a  
        Return all comparisons in the corpus.

        :param fileids: a list or regexp specifying the ids of the files whose
            comparisons have to be returned.
        :return: the given file(s) as a list of Comparison objects.
        :rtype: list(Comparison)
        Tr%   )_fileids
isinstancestrconcatabspaths
CorpusView_read_comparison_blockr   r"   pathencfileids        r   comparisonsz,ComparativeSentencesCorpusReader.comparisons   sz     ?mmG%iG ,0==$+M 'T3 d&A&ACP
 	
s   )A1c                     t        | j                  |dd      D cg c]$  \  }}}| j                  || j                  |      & c}}}      }|D ch c]  }|s|j	                          }}|S c c}}}w c c}w )a&  
        Return a set of all keywords used in the corpus.

        :param fileids: a list or regexp specifying the ids of the files whose
            keywords have to be returned.
        :return: the set of keywords and comparative phrases used in the corpus.
        :rtype: set(str)
        Tr'   )r+   r,   r-   _read_keyword_blocklower)r   r"   r0   r1   r2   all_keywordsr   keywords_sets           r   keywordsz)ComparativeSentencesCorpusReader.keywords   s      ,0==$+M 'T3 d&>&>M
 8DOGwOO Ps   )A1A8A8c                    g }| j                  d      5 }|j                         }ddd       j                  d      D ]5  }|r|j                  d      r|j	                  |j                                7 |S # 1 sw Y   TxY w)z
        Return the list of words and constituents considered as clues of a
        comparison (from listOfkeywords.txt).
        zlistOfkeywords.txtN
z//)openreadsplit
startswithappendstrip)r   r9   fpraw_textlines        r   keywords_readmez0ComparativeSentencesCorpusReader.keywords_readme   sz    
 YY+, 	!wwyH	!NN4( 	*D4??40OODJJL)	* 	! 	!s   A88Bc                     t        | j                  |dd      D cg c]$  \  }}}| j                  || j                  |      & c}}}      S c c}}}w )ac  
        Return all sentences in the corpus.

        :param fileids: a list or regexp specifying the ids of the files whose
            sentences have to be returned.
        :return: all sentences of the corpus as lists of tokens (or as plain
            strings, if no word tokenizer is specified).
        :rtype: list(list(str)) or list(str)
        Tr'   )r+   r,   r-   _read_sent_blockr/   s        r   sentsz&ComparativeSentencesCorpusReader.sents   s[      ,0==$+M 'T3 d&;&;cJ
 	
   )Ac                     t        | j                  |dd      D cg c]$  \  }}}| j                  || j                  |      & c}}}      S c c}}}w )a)  
        Return all words and punctuation symbols in the corpus.

        :param fileids: a list or regexp specifying the ids of the files whose
            words have to be returned.
        :return: the given file(s) as a list of words and punctuation symbols.
        :rtype: list(str)
        Tr'   )r+   r,   r-   _read_word_blockr/   s        r   wordsz&ComparativeSentencesCorpusReader.words   s[      ,0==$+M 'T3 d&;&;cJ
 	
rI   c                    	 |j                         }|sg S t        j                  t        |      }|rt        j                  t        |      }t        j                  t
        |      }|j                         j                         }| j                  r| j                  j                  |      }|j                          g }|r|D ]  }t        t        j                  d|      j                  d            }	t        ||	      }
|j                         }t        j                  |      }|r[|D ]V  \  }}|dk(  r|j                         |
_        !|dk(  r|j                         |
_        <|dk(  sB|j                         |
_        X t"        j                  |      }|r
|d   |
_        |j'                  |
        |rS|D ]N  }t        t        j                  d|      j                  d            }	t        ||	      }
|j'                  |
       P |S )Nz	<cs-(\d)>   )r   r   123r   )readlinerefindall
COMPARISONGRAD_COMPARISONNON_GRAD_COMPARISONrA   r   tokenizeintmatchgroupr   ENTITIES_FEATSr	   r
   r   KEYWORDr   r@   )r   streamrD   comparison_tagsgrad_comparisonsnon_grad_comparisonscomparison_textcomparison_bundlecompr   
comparisonentities_featscodeentity_featr   s                  r   r.   z7ComparativeSentencesCorpusReader._read_comparison_block   s   ??$D	 jjT:O#%::ot#D ')zz2Et'L$"(//"3"9"9";''&*&:&:&C&CO&TO! %'!# 0 =$'t(D(J(J1(M$N	%/!0I&
  &0)7)?)?)E)5C M 1k#'3;:E:K:K:MJ$7%)S[:E:K:K:MJ$7%)S[9D9J9J9LJ$6M #*//$"7"18J.)00<%=* ( 4 =$'t(D(J(J1(M$N	%/!0I&
 *00<= )(e r   c                 l    g }| j                  |      D ]  }|j                  |j                          |S N)r.   r@   r   )r   r^   r9   re   s       r   r5   z4ComparativeSentencesCorpusReader._read_keyword_block  s8    55f= 	0JOOJ../	0r   c                 (   	 |j                         }t        j                  t        |      r.	 |j                         }t        j                  t        |      rn,Yt        j                  t
        |      st        j	                  |      st        j                  t        |      sj| j                  rB| j                  j                  |      D cg c]  }| j                  j                  |       c}S | j                  j                  |      gS c c}w rj   )rR   rS   rZ   STARSrT   rU   r\   CLOSE_COMPARISONr   rX   r   )r   r^   rD   sents       r   rG   z1ComparativeSentencesCorpusReader._read_sent_block  s    ??$Dxxt$!??,Dxxt,  JJz40&..t4

#3T:'' %)$8$8$A$A$$G  ,,55d; 
 !0099$?@@' s   "Dc                 X    g }| j                  |      D ]  }|j                  |        |S rj   )rG   extend)r   r^   rL   rn   s       r   rK   z1ComparativeSentencesCorpusReader._read_word_block1  s2    ))&1 	DLL	r   rj   )r   r   r   r   StreamBackedCorpusViewr-   WhitespaceTokenizerr   r3   r9   rE   rH   rL   r.   r5   rG   rK   r   r   r   r   r   `   sP    " (J +,$,
(&
"
 3)jA,r   r   )r   rS   nltk.corpus.reader.apinltk.tokenizecompilerl   rU   rm   rV   rW   r\   r]   r   r   r   r   r   r   <module>rv      s   8 
 $  	

8RZZ'
2::o. "**]+ bjj+ <=
"**%
&*
 *
ZU| Ur   