U
    "c                      @   sP   d dl mZ d dlZd dlmZmZ d dlmZ d dlm	Z	 G dd de
ZdS )    )print_functionN)Counterdefaultdict)reduce)pairwisec                   @   sL   e Zd ZdZdd ZdddZdd Zd	d
 Zdd Zdd Z	dddZ
dS )SubwordTokenizerzd
    This is a Python port of the Subword NMT from
    https://github.com/rsennrich/subword-nmt
    c                 C   s.   |  || _|  \| _| _t| j| _d S N)get_vocabularyvocabget_pair_statisticsstatsindicescopydeepcopyZ	big_stats)selffilename r   7/tmp/pip-unpacked-wheel-43qgbez0/sacremoses/subwords.py__init__   s    zSubwordTokenizer.__init__Fc              	   C   s   t  }t|N}|rF|D ]*}| d\}}||  t|7  < qn||   W 5 Q R X t dd | D }| S )N c                 S   s0   i | ](\}}t |d d |d d f |qS )Nu   )tuple).0kvr   r   r   
<dictcomp>$   s      z3SubwordTokenizer.get_vocabulary.<locals>.<dictcomp>)	r   openstripsplitintupdatereaditemsmost_common)r   r   is_dictr
   Zfinlinewordcountr   r   r   r	      s    
zSubwordTokenizer.get_vocabularyc                 C   sp   t  }tdd }t| jD ]J\}\}}t|D ]4\}}|||f  |7  < |||f |  d7  < q0q||fS )z5Count frequency of all symbol pairs, and create indexc                   S   s   t  S r   )r   r   r   r   r   <lambda>-       z6SubwordTokenizer.get_pair_statistics.<locals>.<lambda>   )r   r   	enumerater
   r   )r   r   r   ir&   freqprevcurrr   r   r   r   (   s    z$SubwordTokenizer.get_pair_statisticsc                    sD   |\ d |dd fdd}t||dd |d fS )	z
        From https://stackoverflow.com/a/40367074/610569
            >>> modify_token(('s', 'h', 'e', 'r', 'l', 'o', 'c', 'k'), ('h', 'e'))
            ('S', 'he', 'r', 'l', 'o', 'c', 'k')
         \z\\c                    s0   | d  kr&|kr&| d d f S | |f S )Nr   r   )accefirstZpair_strsecondr   r   r(   ?   s
    
z/SubwordTokenizer.modify_token.<locals>.<lambda>r*   Nr   )joinreplacer   )r   tokenpairfr   r4   r   modify_token6   s
    zSubwordTokenizer.modify_tokenc                 C   sd   g }| j |  D ]L\}}|dk r$q| j| \}}| ||}||f| j|< |||||f q|S )zJReplace all occurrences of a symbol pair ('A', 'B') with a new symbol 'AB'r*   )r   r"   r
   r<   append)r   r:   changesjr-   r&   Znew_wordr   r   r   replace_pairE   s    zSubwordTokenizer.replace_pairc              	   C   sD  d| j |< t | j|< |\}}|| }|D ]\}}}}	d}
z|||
}
W n tk
rf   Y qdY nX |
t|d k rZ||
d  |krZ|
r||
d |
d  }| j |  |	8  < | j| |  d8  < |
t|d k rP||
d  |ks|
t|d ks||
d  |krP||
d |
d  }| j |  |	8  < | j| |  d8  < |
d7 }
q<|
d7 }
q<d}
z|||
}
W n tk
r   Y q*Y nX |
r||
d |
d  }| j |  |	7  < | j| |  d7  < |
t|d k r2||
d  |kr2||
|
d  }| j |  |	7  < | j| |  d7  < |
d7 }
qhq*dS )z
        Minimally update the indices and frequency of symbol pairs
        if we merge a pair of symbols, only pairs that overlap with occurrences
        of this pair are affected, and need to be updated.
        r   r*         N)r   r   r   index
ValueErrorlen)r   r:   changedr5   r6   Znew_pairr?   r&   Zold_wordr-   r,   r.   Znexr   r   r   update_pair_statisticsQ   sT    
$



$z'SubwordTokenizer.update_pair_statisticsrA   r*   Nc                 C   st   t | j d }t|D ]T}| j|}|D ]>\}}	| |}
| ||
 d| j|< | j| |k r.  d S q.qd S )N
   r   )maxr   valuesranger#   r@   rG   )r   Znum_symbolsZmin_freqZjumpr$   	thresholdr,   Zmost_freq_tokensr9   r'   r>   r   r   r   learn   s    

zSubwordTokenizer.learn)F)rA   r*   N)__name__
__module____qualname____doc__r   r	   r   r<   r@   rG   rM   r   r   r   r   r      s   
[r   )
__future__r   r   collectionsr   r   	functoolsr   Zsacremoses.utilr   objectr   r   r   r   r   <module>   s
   