# -*- coding: utf-8 -*-
import unicodedata
import regex as re
class AutoTokenizerPosition:
    """
    Helpers for keyword-only NER data: map a keyword's character
    positions in raw text onto token-level start positions.

    tokenizer = BertTokenizer.from_pretrained("clue/albert_chinese_tiny")
    tokenizer = BertTokenizer.from_pretrained("bert-base-chinese")

    Using this helper avoids a lot of unnecessary alignment trouble.

    ## Installation
    ```
    > pip install tkitAutoTokenizerPosition
    # or
    > pip install git+https://github.com/napoler/tkit-AutoTokenizerPosition
    ```
    """
    def __init__(self, tokenizer):
        """
        Wrap an existing tokenizer, e.g.:

        ```
        tokenizer = BertTokenizer.from_pretrained("clue/albert_chinese_tiny")
        tokenizer = BertTokenizer.from_pretrained("bert-base-chinese")
        ```

        Args:
            tokenizer: a Hugging Face tokenizer instance.
        """
        self.tokenizer = tokenizer
    def E_trans_to_C(self, string):
        """
        Convert Chinese (full-width) punctuation to its English (ASCII)
        equivalent. (Despite the name, the direction is Chinese -> English.)

        Args:
            string: input text.

        Returns:
            str: text with full-width punctuation replaced.
        """
        E_pun = u',.!?[]()<>"\''
        C_pun = u',。!?【】()《》“‘'
        # Map each Chinese mark to its ASCII counterpart. Only the
        # left-hand quote variants are covered here; filterPunctuation()
        # handles both sides.
        table = {ord(f): ord(t) for f, t in zip(C_pun, E_pun)}
        return string.translate(table)
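
    # Example: the mapping is applied per character, e.g.
    #   atp.E_trans_to_C("你好,世界。")  ->  "你好,世界."
    # (`atp` stands for an AutoTokenizerPosition instance; the name is
    # illustrative only.)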
    def filterPunctuation(self, x):
        """
        Normalize Chinese quotes, ellipsis, and dashes to ASCII and
        strip spaces.

        Args:
            x: input text.

        Returns:
            str: normalized text.
        """
        x = re.sub(r"[‘’]", "'", x)
        x = re.sub(r'[“”]', '"', x)
        x = re.sub(r"[…]", "...", x)
        x = re.sub(r"[—]", "-", x)
        # Drop all ASCII spaces.
        x = re.sub(r" ", "", x)
        return x
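
    # Example: quotes, ellipsis, dash, and spaces are normalized, e.g.
    #   atp.filterPunctuation("“你 好”—…")  ->  '"你好"-...'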
    def clear(self, text):
        """
        Clean text for tokenization: lowercase, Unicode-normalize,
        convert Chinese punctuation, and map whitespace to special tokens.

        Args:
            text: raw input text.

        Returns:
            str: cleaned text.
        """
        text = text.lower()
        # Normalize, then convert Chinese punctuation to ASCII.
        text = unicodedata.normalize('NFKD', text)
        text = self.filterPunctuation(text)
        # Tabs become the pad token. Plain spaces were already removed by
        # filterPunctuation(), so the space replacement below is a no-op.
        text = text.replace("\t", self.tokenizer.pad_token).replace(" ", self.tokenizer.pad_token)
        # Line breaks become the separator token.
        text = text.replace("\n", self.tokenizer.sep_token).replace("\r", self.tokenizer.sep_token)
        return text
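
    # Example (a sketch, assuming a BERT tokenizer whose pad/sep tokens
    # are "[PAD]"/"[SEP]"):
    #   atp.clear("Hello\tWorld\n")  ->  "hello[PAD]world[SEP]"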
    def getWordList(self, text):
        """
        Tokenize text after cleaning.

        Args:
            text: raw input text.

        Returns:
            list: token list for the cleaned text.
        """
        text = self.clear(text)
        return self.tokenizer.tokenize(text)
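
    # Example (a sketch, assuming "bert-base-chinese", which tokenizes
    # Chinese character by character):
    #   atp.getWordList("你好")  ->  ['你', '好']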
    def getText(self, wordList):
        """
        Rebuild a plain string from a token list by stripping the
        WordPiece continuation prefix "##" and joining the pieces.
        """
        for i, w in enumerate(wordList):
            wordList[i] = w.replace("##", "")
        return "".join(wordList)
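
    # Example: continuation markers are stripped before joining, e.g.
    #   atp.getText(["play", "##ing"])  ->  "playing"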
    def autoLen(self, text):
        """
        Length of the text in tokens after cleaning.

        Args:
            text: raw input text.

        Returns:
            int: number of tokens the tokenizer produces for the text.
        """
        text = text.lower()
        realLen = len(self.getWordList(text))
        return realLen
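
    # Example (a sketch, assuming "bert-base-chinese"):
    #   atp.autoLen("你好")  ->  2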
    def findAll(self, text, word):
        """
        Yield every character index at which `word` starts in `text`
        (case-insensitive; overlapping matches are included).

        Args:
            text: text to search in.
            word: substring to look for.

        Yields:
            int: character start index of each occurrence.
        """
        text = text.lower()
        word = word.lower()
        idx = text.find(word)
        while idx != -1:
            yield idx
            idx = text.find(word, idx + 1)
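
    # Example: case-insensitive character offsets, e.g.
    #   list(atp.findAll("abcABCabc", "abc"))  ->  [0, 3, 6]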
    def fixPosition(self, text, word, startList=None):
        """
        Yield the token-level (start, end) position of every occurrence
        of `word` in `text`. All occurrences are matched automatically;
        passing `startList` restricts the search to those character
        offsets. Positions are offsets into the token list returned by
        tokenize(), i.e. they do not account for special tokens such as
        [CLS].

        Args:
            text: text to search in.
            word: keyword to locate.
            startList (list, optional): character start offsets to use
                instead of searching. Defaults to None (search everywhere).

        Yields:
            tuple: (token_start, token_end) for each occurrence.
        """
        text = text.lower()
        word = word.lower()
        if not startList:
            startList = self.findAll(text, word)
        for start in startList:
            # Tokens before the match give the token-level start; the
            # keyword's own token count gives the span length.
            s_start = self.autoLen(text[:start])
            startLen = self.autoLen(word)
            yield s_start, s_start + startLen
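
    # Example (a sketch, assuming "bert-base-chinese"):
    #   list(atp.fixPosition("我爱北京天安门", "北京"))  ->  [(2, 4)]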
    def autoTypeWord(self, text, word, wType=None, startList=None):
        """
        Like fixPosition(), but attaches an entity type label to each span.

        Args:
            text: text to search in.
            word: keyword to locate.
            wType (optional): entity type label. Defaults to None.
            startList (list, optional): character start offsets to use
                instead of searching. Defaults to None (search everywhere).

        Yields:
            tuple: (token_start, token_end, wType) for each occurrence.
        """
        # Forward the caller's startList so any restriction actually applies.
        for s_start, s_end in self.fixPosition(text, word, startList=startList):
            yield s_start, s_end, wType
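

# A minimal usage sketch (not part of the library). It assumes the
# `transformers` package is installed and the "bert-base-chinese"
# checkpoint is available.
if __name__ == "__main__":
    from transformers import BertTokenizer

    tokenizer = BertTokenizer.from_pretrained("bert-base-chinese")
    atp = AutoTokenizerPosition(tokenizer)

    text = "我爱北京天安门,北京是首都。"
    word = "北京"
    # Token-level spans of every occurrence of the keyword.
    print(list(atp.fixPosition(text, word)))      # e.g. [(2, 4), (8, 10)]
    # The same spans with an entity type attached.
    print(list(atp.autoTypeWord(text, word, wType="LOC")))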