The BPE vocabulary is learned as follows (a standalone sketch of steps 3-5 follows this list):

1. Prepare a sufficiently large training corpus and decide on the desired subword vocabulary size.
2. Prepare the base vocabulary: for English, e.g., the 26 letters plus assorted symbols.
3. Using the base vocabulary, split every word in the corpus into a character sequence and append the end-of-word suffix "</w>"; at this stage the subword granularity is single characters. For example, if the word "low" occurs 5 times, it is rewritten as "l o w </w>": 5.
4. Count the frequency of every adjacent symbol pair (the "byte pair") and merge the most frequent pair into a new subword.
5. Repeat step 4 until the subword vocabulary reaches the size set in step 1, or until the most frequent remaining pair occurs only once.
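As a minimal sketch of steps 3-5, the loop below learns merges from the classic four-word toy corpus of the original BPE paper (the same word counts as the commented-out vocab in the full script below; the helper names toy_get_stats and toy_merge are mine for illustration). Note that when two pairs tie in frequency, max() simply returns the first one encountered, so the exact merge order can vary:

import re, collections

# Step 3 already applied: words split into characters, '</w>' appended.
toy_vocab = {'l o w </w>': 5, 'l o w e r </w>': 2,
             'n e w e s t </w>': 6, 'w i d e s t </w>': 3}

def toy_get_stats(vocab):
    # Step 4: count adjacent symbol pairs, weighted by word frequency.
    pairs = collections.defaultdict(int)
    for word, freq in vocab.items():
        symbols = word.split()
        for i in range(len(symbols) - 1):
            pairs[symbols[i], symbols[i + 1]] += freq
    return pairs

def toy_merge(pair, vocab):
    # Merge the chosen pair into a single symbol wherever it occurs.
    pattern = re.compile(r'(?<!\S)' + re.escape(' '.join(pair)) + r'(?!\S)')
    return {pattern.sub(''.join(pair), word): freq for word, freq in vocab.items()}

# Step 5: repeat the merge step; 10 merges suffice for this corpus.
for i in range(10):
    pairs = toy_get_stats(toy_vocab)
    best = max(pairs, key=pairs.get)
    toy_vocab = toy_merge(best, toy_vocab)
    print(i, best)

The first merges found are ('e', 's'), ('es', 't') and ('est', '</w>'), turning 'n e w e s t </w>' into 'n e w est</w>': frequent endings are merged into subwords before rare stems.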
'''
https://leimao.github.io/blog/Byte-Pair-Encoding/
'''
import re, collections


def get_vocab(filename):
    # Step 3: read the corpus, split every word into characters and
    # append the end-of-word marker '</w>'.
    vocab = collections.defaultdict(int)
    with open(filename, 'r', encoding='utf-8') as fhand:
        for line in fhand:
            words = line.strip().split()
            for word in words:
                vocab[' '.join(list(word)) + ' </w>'] += 1
    return vocab


def get_stats(vocab):
    # Step 4: count the frequency of every adjacent symbol pair,
    # weighted by the frequency of the word it occurs in.
    pairs = collections.defaultdict(int)
    for word, freq in vocab.items():
        symbols = word.split()
        for i in range(len(symbols) - 1):
            pairs[symbols[i], symbols[i + 1]] += freq
    return pairs


def merge_vocab(pair, v_in):
    # Merge the chosen pair into a single new symbol everywhere it occurs;
    # the lookaround anchors keep the match on symbol boundaries.
    v_out = {}
    bigram = re.escape(' '.join(pair))
    p = re.compile(r'(?<!\S)' + bigram + r'(?!\S)')
    for word in v_in:
        w_out = p.sub(''.join(pair), word)
        v_out[w_out] = v_in[word]
    return v_out


def get_tokens_from_vocab(vocab):
    # Collect the current token inventory and remember how each known
    # word is tokenized under the current set of merges.
    tokens_frequencies = collections.defaultdict(int)
    vocab_tokenization = {}
    for word, freq in vocab.items():
        word_tokens = word.split()
        for token in word_tokens:
            tokens_frequencies[token] += freq
        vocab_tokenization[''.join(word_tokens)] = word_tokens
    return tokens_frequencies, vocab_tokenization


def measure_token_length(token):
    # Treat the 4-character suffix '</w>' as a single symbol of length 1.
    if token[-4:] == '</w>':
        return len(token[:-4]) + 1
    else:
        return len(token)


def tokenize_word(string, sorted_tokens, unknown_token='</u>'):
    # Greedy longest-match-first tokenization: try tokens in sorted order,
    # and recurse on the unmatched substrings with the remaining tokens.
    if string == '':
        return []
    if sorted_tokens == []:
        return [unknown_token]

    string_tokens = []
    for i in range(len(sorted_tokens)):
        token = sorted_tokens[i]
        token_reg = re.escape(token.replace('.', '[.]'))

        matched_positions = [(m.start(0), m.end(0)) for m in re.finditer(token_reg, string)]
        if len(matched_positions) == 0:
            continue
        substring_end_positions = [matched_position[0] for matched_position in matched_positions]

        substring_start_position = 0
        for substring_end_position in substring_end_positions:
            substring = string[substring_start_position:substring_end_position]
            string_tokens += tokenize_word(string=substring, sorted_tokens=sorted_tokens[i + 1:], unknown_token=unknown_token)
            string_tokens += [token]
            substring_start_position = substring_end_position + len(token)
        remaining_substring = string[substring_start_position:]
        string_tokens += tokenize_word(string=remaining_substring, sorted_tokens=sorted_tokens[i + 1:], unknown_token=unknown_token)
        break
    return string_tokens


# vocab = {'l o w </w>': 5, 'l o w e r </w>': 2, 'n e w e s t </w>': 6, 'w i d e s t </w>': 3}
vocab = get_vocab('../data/pg16457.txt')

print('==========')
print('Tokens Before BPE')
tokens_frequencies, vocab_tokenization = get_tokens_from_vocab(vocab)
print('All tokens: {}'.format(tokens_frequencies.keys()))
print('Number of tokens: {}'.format(len(tokens_frequencies.keys())))
print('==========')

# Step 5: keep merging the most frequent pair until num_merges is
# reached or no pairs remain.
num_merges = 10000
for i in range(num_merges):
    pairs = get_stats(vocab)
    if not pairs:
        break
    best = max(pairs, key=pairs.get)
    vocab = merge_vocab(best, vocab)
    print('Iter: {}'.format(i))
    print('Best pair: {}'.format(best))
    tokens_frequencies, vocab_tokenization = get_tokens_from_vocab(vocab)
    print('All tokens: {}'.format(tokens_frequencies.keys()))
    print('Number of tokens: {}'.format(len(tokens_frequencies.keys())))
    print('==========')

# Let's check how tokenization works for a known and an unknown word.
word_given_known = 'mountains</w>'
word_given_unknown = 'Ilikeeatingapples!</w>'

# Sort tokens by length (longest first), breaking ties by frequency, so
# that tokenize_word always prefers the longest available match.
sorted_tokens_tuple = sorted(tokens_frequencies.items(), key=lambda item: (measure_token_length(item[0]), item[1]), reverse=True)
sorted_tokens = [token for (token, freq) in sorted_tokens_tuple]

print(sorted_tokens)

word_given = word_given_known

print('Tokenizing word: {}...'.format(word_given))
if word_given in vocab_tokenization:
    print('Tokenization of the known word:')
    print(vocab_tokenization[word_given])
    print('Tokenization treating the known word as unknown:')
    print(tokenize_word(string=word_given, sorted_tokens=sorted_tokens, unknown_token='</u>'))
else:
    print('Tokenization of the unknown word:')
    print(tokenize_word(string=word_given, sorted_tokens=sorted_tokens, unknown_token='</u>'))

word_given = word_given_unknown

print('Tokenizing word: {}...'.format(word_given))
if word_given in vocab_tokenization:
    print('Tokenization of the known word:')
    print(vocab_tokenization[word_given])
    print('Tokenization treating the known word as unknown:')
    print(tokenize_word(string=word_given, sorted_tokens=sorted_tokens, unknown_token='</u>'))
else:
    print('Tokenization of the unknown word:')
    print(tokenize_word(string=word_given, sorted_tokens=sorted_tokens, unknown_token='</u>'))
'''
Tokenizing word: mountains</w>...
Tokenization of the known word:
['mountains</w>']
Tokenization treating the known word as unknown:
['mountains</w>']
Tokenizing word: Ilikeeatingapples!</w>...
Tokenization of the unknown word:
['I', 'like', 'ea', 'ting', 'app', 'l', 'es!</w>']
'''
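The demo above tokenizes one word at a time. To encode running text, a thin wrapper can split on whitespace, append the '</w>' marker to each word, and reuse the structures built above. A minimal sketch (the helper name tokenize_text is an illustrative assumption, not part of the original script):

def tokenize_text(text, sorted_tokens, vocab_tokenization, unknown_token='</u>'):
    # Reuse the cached tokenization for known words; fall back to the
    # greedy longest-match-first search (tokenize_word) for everything else.
    tokens = []
    for word in text.strip().split():
        word = word + '</w>'
        if word in vocab_tokenization:
            tokens += vocab_tokenization[word]
        else:
            tokens += tokenize_word(string=word, sorted_tokens=sorted_tokens,
                                    unknown_token=unknown_token)
    return tokens

# Example usage (output depends on the merges learned from the corpus):
# print(tokenize_text('I like eating apples !', sorted_tokens, vocab_tokenization))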