tkitAutoTokenizerPosition.AutoTokenizerPosition 源代码

# -*- coding: utf-8 -*-
import unicodedata
import regex as re
class AutoTokenizerPosition:
    """Convert keyword character positions into token positions.

    Intended for NER data that only provides the keywords themselves:
    the keyword is located in the text and its start/end are reported
    as indices into the tokenizer's output.

    Example tokenizers::

        tokenizer = BertTokenizer.from_pretrained("clue/albert_chinese_tiny")
        tokenizer = BertTokenizer.from_pretrained("bert-base-chinese")

    Installation::

        pip install tkitAutoTokenizerPosition
        # or
        pip install git+<repository URL>
    """

    # Punctuation handled by E_trans_to_C: the character at index i of
    # _C_PUN is replaced by the character at index i of _E_PUN.
    # NOTE(review): ',', '!', '?', '(' and ')' in _C_PUN are ASCII here and
    # therefore map to themselves; presumably they were full-width
    # characters in the original file -- confirm before changing.
    _C_PUN = ',。!?【】()《》“‘'
    _E_PUN = ',.!?[]()<>"\''
    _C_TO_E_TABLE = str.maketrans(_C_PUN, _E_PUN)

    # Typographic punctuation replaced by filterPunctuation.
    _TYPOGRAPHIC_TABLE = str.maketrans({
        "‘": "'",
        "’": "'",
        "“": '"',
        "”": '"',
        "…": "...",
        "—": "-",
    })

    def __init__(self, tokenizer):
        """Store the tokenizer used for all position computations.

        Args:
            tokenizer: Object providing ``tokenize(text)`` and the
                ``pad_token`` / ``sep_token`` string attributes, e.g. a
                ``BertTokenizer``.
        """
        self.tokenizer = tokenizer

    def E_trans_to_C(self, string: str) -> str:
        """Replace Chinese punctuation with its English counterpart.

        Despite the method name, the translation goes from the Chinese
        characters in ``_C_PUN`` to the English ones in ``_E_PUN``.

        Args:
            string: Text to translate.

        Returns:
            The text with every character of ``_C_PUN`` replaced.
        """
        return string.translate(self._C_TO_E_TABLE)

    def filterPunctuation(self, x: str) -> str:
        """Replace typographic punctuation with ASCII and drop ``&nbsp``.

        Curly single/double quotes become straight quotes, the ellipsis
        character becomes ``...``, the em dash becomes ``-`` and every
        occurrence of the literal ``&nbsp`` is removed.

        Args:
            x: Text to clean.

        Returns:
            The cleaned text.
        """
        # None of the replacement characters is itself a key of the table
        # or part of "&nbsp", so one translate pass followed by one replace
        # gives the same result as applying the substitutions one by one.
        return x.translate(self._TYPOGRAPHIC_TABLE).replace("&nbsp", "")

    def clear(self, text: str) -> str:
        """Normalise text before it is handed to the tokenizer.

        The text is lowercased, NFKD-normalised and passed through
        ``filterPunctuation``. Every tab and space is then replaced by the
        tokenizer's ``pad_token`` and every ``\\n`` and ``\\r`` by its
        ``sep_token`` (so ``\\r\\n`` produces two separator tokens).

        Args:
            text: Raw text.

        Returns:
            The normalised text.
        """
        pad = self.tokenizer.pad_token
        sep = self.tokenizer.sep_token
        text = unicodedata.normalize('NFKD', text.lower())
        text = self.filterPunctuation(text)
        text = text.replace("\t", pad).replace(" ", pad)
        return text.replace("\n", sep).replace("\r", sep)

    def getWordList(self, text: str):
        """Tokenize ``text`` after normalising it with ``clear``.

        Args:
            text: Raw text.

        Returns:
            Whatever ``tokenizer.tokenize`` returns for the cleaned text.
        """
        return self.tokenizer.tokenize(self.clear(text))

    def getText(self, wordList) -> str:
        """Join tokens back into a string, removing every ``##`` marker.

        The input list is left unmodified.

        Args:
            wordList: Iterable of token strings.

        Returns:
            The concatenated tokens with all ``##`` occurrences removed.
        """
        return "".join(token.replace("##", "") for token in wordList)

    def autoLen(self, text: str) -> int:
        """Return the number of tokens ``text`` is split into.

        Args:
            text: Raw text.

        Returns:
            Length of ``getWordList(text)``.
        """
        # getWordList -> clear already lowercases the text.
        return len(self.getWordList(text))

    def findAll(self, text: str, word: str):
        """Yield every start index of ``word`` in ``text``, ignoring case.

        Both strings are lowercased first, so the indices refer to the
        lowercased text. Overlapping occurrences are reported because the
        search resumes one character after each hit.

        Args:
            text: Text to search in.
            word: Substring to look for. An empty ``word`` yields nothing.

        Yields:
            Character index of each occurrence, in ascending order.
        """
        text = text.lower()
        word = word.lower()
        if not word:
            # str.find("") matches at every index, which would only
            # produce meaningless zero-length hits.
            return
        idx = text.find(word)
        while idx != -1:
            yield idx
            idx = text.find(word, idx + 1)

    def fixPosition(self, text: str, word: str, startList=None):
        """Yield token-level ``(start, end)`` spans of ``word`` in ``text``.

        For every character start position, the token start is the number
        of tokens in the text before that position, and the token end is
        the start plus the number of tokens in ``word``.

        Args:
            text: Full text.
            word: Keyword to locate.
            startList: Optional iterable of character start positions that
                restricts the result to those positions. When omitted or
                empty, all occurrences found by ``findAll`` are used.

        Yields:
            ``(token_start, token_end)`` for each start position.
        """
        text = text.lower()
        word = word.lower()
        # Materialise so that iterators/generators are accepted as well.
        starts = list(startList) if startList is not None else []
        if not starts:
            starts = self.findAll(text, word)
        # The keyword's token length does not depend on the position.
        word_len = self.autoLen(word)
        for start in starts:
            token_start = self.autoLen(text[:start])
            yield token_start, token_start + word_len

    def autoTypeWord(self, text: str, word: str, wType=None, startList=None):
        """Yield token spans of ``word`` together with a type label.

        Args:
            text: Full text.
            word: Keyword to locate.
            wType: Label attached unchanged to every span.
            startList: Optional iterable of character start positions,
                forwarded to ``fixPosition``.

        Yields:
            ``(token_start, token_end, wType)`` for each span.
        """
        for token_start, token_end in self.fixPosition(
                text, word, startList=startList):
            yield token_start, token_end, wType