U
    "c%                     @   s8   d dl Z d dlZG dd dZG dd dZddgZdS )    Nc                   @   s"   e Zd ZdZdd ZdddZdS )Perlunipropsa3  
    This class is used to read lists of characters from the Perl Unicode
    Properties (see http://perldoc.perl.org/perluniprops.html).
    The files in the perluniprop.zip are extracted using the Unicode::Tussle
    module from http://search.cpan.org/~bdfoy/Unicode-Tussle-1.11/lib/Unicode/Tussle.pm
    c                 C   sP   t jt jtd | _dddddddd	d
dddddddddddddg| _d S )Nz/data/perluniprops/ZClose_PunctuationZCurrency_SymbolZIsAlnumZIsAlphaZIsLowerZIsNZIsScZIsSoZIsUpperZLine_SeparatorNumberZOpen_PunctuationZPunctuationZ	SeparatorZSymbolZLowercase_LetterZTitlecase_LetterZUppercase_LetterZIsPfZIsPiZ
CJKSymbolsCJK)ospathdirnameabspath__file__datadirZavailable_categoriesself r   5/tmp/pip-unpacked-wheel-43qgbez0/sacremoses/corpus.py__init__   s2    zPerluniprops.__init__Nc                 c   s:   t jdd|d }td|}|dD ]
}|V  q*dS )u  
        This module returns a list of characters from  the Perl Unicode Properties.
        They are very useful when porting Perl tokenizers to Python.

            >>> from sacremoses.corpus import Perluniprops
            >>> pup = Perluniprops()
            >>> list(pup.chars('Open_Punctuation'))[:5] == [u'(', u'[', u'{', u'༺', u'༼']
            True
            >>> list(pup.chars('Currency_Symbol'))[:5] == [u'$', u'¢', u'£', u'¤', u'¥']
            True
            >>> pup.available_categories[:5]
            ['Close_Punctuation', 'Currency_Symbol', 'IsAlnum', 'IsAlpha', 'IsLower']

        :return: a generator of characters given the specific unicode character category
        dataZperlunipropsz.txt
sacremosesutf-8N)r   r   joinpkgutilget_datadecode)r   categoryrelative_pathbinary_datachr   r   r   chars.   s    zPerluniprops.chars)N)__name__
__module____qualname____doc__r   r   r   r   r   r   r      s   r   c                   @   s"   e Zd ZdZdd ZdddZdS )	NonbreakingPrefixesz
    This is a class to read the nonbreaking prefixes textfiles from the
    Moses Machine Translation toolkit. These lists are used in the Python port
    of the Moses' word tokenizer.
    c              '   C   s   t jt jtd | _dddddddd	d
dddddddddddddddddddddd d!d"d#d$d%d&d'd(&| _| jd)d* | j D  d S )+Nz/data/nonbreaking_prefixes/asZbncacsdeelenesetfifrgaZguhihuisitknltlvmlZmnimrnlorpaplptroruskslsvtateZtdtZyuezh)&ZassameseZbengalicatalanczechgermangreekenglishspanishestonianfinnishfrenchZirishZgujaratiZhindi	hungarian	icelandicitalianZkannada
lithuanianZlatvianZ	malayalamZmanipuriZmarathidutchZoriyaZpunjabipolish
portugueseromanianrussianslovak	slovenianswedishZtamilZteluguZtetumZ	cantonesechinesec                 S   s   i | ]
}||qS r   r   .0vr   r   r   
<dictcomp>x   s      z0NonbreakingPrefixes.__init__.<locals>.<dictcomp>)	r   r   r   r   r	   r
   available_langsupdatevaluesr   r   r   r   r   K   sT    )zNonbreakingPrefixes.__init__N#c                 c   s   || j krd| j |  g}n(|dkr>dd t| j  D }ndg}|D ]P}tjdd|}td|}|d		 D ] }|
 }|rv||sv|V  qvqHdS )
u  
        This module returns a list of nonbreaking prefixes for the specified
        language(s).

            >>> from sacremoses.corpus import NonbreakingPrefixes
            >>> nbp = NonbreakingPrefixes()
            >>> list(nbp.words('en'))[:10] == [u'A', u'B', u'C', u'D', u'E', u'F', u'G', u'H', u'I', u'J']
            True
            >>> list(nbp.words('ta'))[:5] == ['ர', 'ூ', 'திரு', 'ஏ', 'பீ']
            True

        :return: a generator words for the specified language(s).
        nonbreaking_prefix.Nc                 S   s   g | ]}d | qS )r`   r   rX   r   r   r   
<listcomp>   s    z-NonbreakingPrefixes.words.<locals>.<listcomp>znonbreaking_prefix.enr   Znonbreaking_prefixesr   r   )r\   setr^   r   r   r   r   r   r   
splitlinesstrip
startswith)r   langZignore_lines_startswith	filenamesfilenamer   r   liner   r   r   wordsz   s    
zNonbreakingPrefixes.words)Nr_   )r   r   r   r   r   rj   r   r   r   r   r    D   s   /r    )r   r   r   r    __all__r   r   r   r   <module>   s   <X