U
    "c                     @   s:   d dl Z d dlZd dlmZ d dlmZ G dd dZdS )    N)	text_type)chainc                $   @   s  e Zd ZdZdddddddd	d
dg
ZddgZdddddddddddddddgZddddd d!gZd"d#d$d%d&d'd(d)d*dg
Zd+gZ	d,d-gZ
d.gZd/gZd0d1d2ddd3d4d5d6d7d8d9d:d;d<d=d>d?d@dAdBdCdDdEdFdGdHdIdddJdKdLdMdNdOg$Zd\dSdTZdUdV ZdWdX ZdYdZ Zd[S )]MosesPunctNormalizerz
    This is a Python port of the Moses punctuation normalizer from
    https://github.com/moses-smt/mosesdecoder/blob/master/scripts/tokenizer/normalize-punctuation.perl
    )z\r )z\(z ()z\)z) )z + )z\) ([.!:?;,])z)\g<1>)z\( ()z \)))z(\d) %z\g<1>%)z ::)z ;;)`')''z " )u   „")u   “r   )u   ”r   )u   –-)u   —z - )   ´r   )u   ([a-zA-Z])‘([a-zA-Z])\g<1>'\g<2>)u   ([a-zA-Z])’([a-zA-Z])r   )u   ‘r   )u   ‚r   )u   ’r   )r   r   )u   ´´r   )u   …z...)u    « r   )u   « r   )   «r   )u    » r   )u    »r   )   »r   )u    %%)u   nº u   nº )u    :r	   )u    ºCu    ºC)u    cmz cm)u    \??)u    \!!)u    ;r
   )u   , z, )z"([,.]+)z\g<1>")z,"z",)z(\.+)"(\s*[^<])z"\g<1>\g<2>)
   (\d) (\d)z\g<1>,\g<2>)r   z\g<1>.\g<2>)u   ，,)u   。\s*. )u   、r   )u   ∶r	   )u   ：r	   )u   ？r   )u   《r   )u   》r   )u   ）r   )u   ！r   )u   （r   )u   ；r
   )u   」r   )u   「r   )u   ０0)u   １1)u   ２2)u   ３3)u   ４4)u   ５5)u   ６6)u   ７7)u   ８8)u   ９9)u   ．\s*r   )u   ～~)u   ━r   )u   〈<)u   〉>)u   【[)u   】])u   ％r   enTFc                 C   s   | j | j| j| jg| _|r*| jd| j |r\|dkrF| j| j n|dkr\| j| j	 |r|dkrx| j| j
 n| j| j tt| j | _|| _|| _dS )ah  
        :param language: The two-letter language code.
        :type lang: str
        :param penn: Normalize Penn Treebank style quotations.
        :type penn: bool
        :param norm_quote_commas: Normalize quotations and commas
        :type norm_quote_commas: bool
        :param norm_numbers: Normalize numbers
        :type norm_numbers: bool
           r)   )deesfr)r+   r,   czcsr-   N)EXTRA_WHITESPACENORMALIZE_UNICODEFRENCH_QUOTESHANDLE_PSEUDO_SPACESsubstitutionsinsertNORMALIZE_UNICODE_IF_NOT_PENNappendEN_QUOTATION_FOLLOWED_BY_COMMA$DE_ES_FR_QUOTATION_FOLLOWED_BY_COMMADE_ES_CZ_CS_FROTHERlistr   pre_replace_unicode_punctpost_remove_control_chars)selflangZpennZnorm_quote_commasZnorm_numbersr=   r>    rA   8/tmp/pip-unpacked-wheel-43qgbez0/sacremoses/normalize.py__init__   s&    zMosesPunctNormalizer.__init__c                 C   sJ   | j r| |}| jD ]\}}t||t|}q| jrB| |}| S )z?
        Returns a string with normalized punctuation.
        )	r=   replace_unicode_punctr4   resubr   r>   remove_control_charsstripr?   textregexpZsubstitutionrA   rA   rB   	normalize   s    

zMosesPunctNormalizer.normalizec                 C   s&   | j D ]\}}t||t|}q|S )N)REPLACE_UNICODE_PUNCTUATIONrE   rF   r   rI   rA   rA   rB   rD      s    z*MosesPunctNormalizer.replace_unicode_punctc                 C   s   t dd|S )Nz\p{C}r   )regexrF   )r?   rJ   rA   rA   rB   rG      s    z)MosesPunctNormalizer.remove_control_charsN)r)   TTTFF)__name__
__module____qualname____doc__r0   r6   r1   r2   r3   r8   r9   r:   r;   rM   rC   rL   rD   rG   rA   rA   rA   rB   r      s   
)      
.r   )rE   rN   sixr   	itertoolsr   r   rA   rA   rA   rB   <module>   s   