Source code for tkitSeg.seg

from transformers import BertTokenizer
import onnxruntime
import onnx
import os
import numpy as np

def to_numpy(tensor):
    """Convert a tensor or array-like object to a NumPy array.

    Args:
        tensor: input tensor or array-like.

    Returns:
        np.ndarray: the converted array.
    """
    # PyTorch equivalent:
    # tensor.detach().cpu().numpy() if tensor.requires_grad else tensor.cpu().numpy()
    return np.array(tensor)

def softmax(x):
    """Convert logits to probabilities with a numerically stable softmax.

    Args:
        x: array of logits.

    Returns:
        tuple: (probabilities, argmax indices along the last axis).
    """
    # Subtract the per-row max before exponentiating to avoid overflow;
    # mathematically this leaves the result unchanged.
    x_max = np.max(x, axis=-1, keepdims=True)
    e_x = np.exp(x - x_max)
    # Normalize by the per-row sum so each row sums to 1.
    x_sum = np.sum(e_x, axis=-1, keepdims=True)
    f_x = e_x / x_sum
    return f_x, np.argmax(x, axis=-1)
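
# A minimal sketch of softmax() on a single row (illustrative; values rounded):
# probs, idx = softmax(np.array([[1.0, 2.0, 3.0]]))
# probs -> array([[0.09003057, 0.24472847, 0.66524096]])
# idx   -> array([2])
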
class tkitSeg:
    """Chinese word segmentation and part-of-speech tagging."""

    def __init__(self, path="./"):
        # Label set used to decode the model's POS-type predictions.
        self.labels = ['zzz', 'n', 't', 's', 'f', 'm', 'q', 'b', 'r', 'v',
                       'a', 'z', 'd', 'p', 'c', 'u', 'y', 'e', 'o', 'i',
                       'l', 'j', 'h', 'k', 'g', 'x', 'w', 'nr', 'ns', 'nt',
                       'nx', 'nz', 'vd', 'vn', 'vx', 'ad', 'an']
        # Use `path` when it contains the exported model, otherwise fall
        # back to the package directory.
        if os.path.exists(os.path.join(path, "model_troch_export.onnx")):
            self.path = path
        else:
            self.path = os.path.dirname(__file__)
        self.model_path = os.path.join(self.path, "model_troch_export.onnx")
        self.tokenizer = BertTokenizer.from_pretrained(self.path)
        self.loadModel()
    def loadModel(self):
        """Load the exported ONNX model and validate the graph."""
        self.ort_session = onnxruntime.InferenceSession(self.model_path)
        self.ort_session.get_providers()
        onnx_model = onnx.load(self.model_path)
        onnx.checker.check_model(onnx_model)
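
    # A quick sketch (illustrative): the session exposes the graph's input
    # names; prediction() below assumes the exported order is
    # (input_ids, token_type_ids, attention_mask).
    # seg = tkitSeg()
    # print([inp.name for inp in seg.ort_session.get_inputs()])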
    def prediction(self, textLIst):
        """Run batch POS tagging and segmentation through ONNX Runtime.

        Args:
            textLIst: list of input strings.

        Returns:
            tuple: (ONNX Runtime outputs, tokenizer encodings).
        """
        inputData = self.tokenizer(textLIst, padding="max_length",
                                   max_length=128, truncation=True)
        # Derive token_type_ids from the attention mask: every real token
        # gets type 1, padding stays 0.
        token_type_ids = to_numpy(inputData["attention_mask"])
        token_type_ids[token_type_ids > 0] = 1
        ort_inputs = {
            self.ort_session.get_inputs()[0].name: to_numpy(inputData["input_ids"]),
            self.ort_session.get_inputs()[1].name: token_type_ids,
            self.ort_session.get_inputs()[2].name: to_numpy(inputData["attention_mask"]),
        }
        ort_outs = self.ort_session.run(None, ort_inputs)
        return ort_outs, inputData
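
    # A quick sketch of the raw outputs (illustrative; assumes the bundled
    # model is present; the 128 dimension follows from max_length above):
    # ort_outs, enc = seg.prediction(["这是一个测试"])
    # ort_outs[0]: word-span logits, shape (batch, 128, num_span_classes)
    # ort_outs[1]: POS-type logits,  shape (batch, 128, len(self.labels))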
    def autoSeg(self, textLIst):
        """Automatically tag and segment a batch of texts (at most 24 at once).

        Args:
            textLIst: list of input strings (only the first 24 are processed).

        Returns:
            list: one dict per input with keys "text", "pos" and "seg".
        """
        datas = []
        orlen = len(textLIst)
        # Pad the batch to exactly 24 items by cycling the inputs; only the
        # first `orlen` results are decoded below.
        textLIst = (textLIst * 24)[:24]
        # ONNX inference results
        ort_outs, inputData = self.prediction(textLIst)
        out = ort_outs[0]      # word-span predictions
        outType = ort_outs[1]  # POS-type predictions
        for indexp, typeList, wd, attention_mask, text in zip(
                out.argmax(axis=-1), outType.argmax(axis=-1),
                inputData["input_ids"], inputData["attention_mask"],
                textLIst[:orlen]):
            one = []
            words = self.tokenizer.convert_ids_to_tokens(wd)
            wordList = []
            p = 0
            for i, (pi, t, w, mask) in enumerate(zip(
                    indexp.tolist(), typeList.tolist(), words, attention_mask)):
                if mask > 0:
                    # Skip tokens already consumed by the previous word span.
                    if i < p:
                        continue
                    p = i + pi
                    outword = []
                    for ww in words[i:i + pi]:
                        outword.append(ww.replace("##", '')
                                         .replace("[PAD]", ' ')
                                         .replace("[SEP]", ' \n'))
                    if len(outword) > 0:
                        one.append({"word": "".join(outword),
                                    "wtype": self.labels[t]})
                        wordList.append("".join(outword))
                elif mask == 0:
                    break
            datas.append({"text": text, "pos": one, "seg": wordList})
        return datas
# Usage example:
# text = [" 张杨,男,汉族,黑龙江双城人,1988年2月6日生于贵州省贵阳市",
#         " 自学习结合部分句法分析的汉语词性标注"]
# Seg = tkitSeg()
# datas = Seg.autoSeg(text)
# print(datas)
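
# The returned structure (schema as built in autoSeg above; the values shown
# are placeholders, not real model output):
# [
#     {
#         "text": "...",                               # original input string
#         "pos": [{"word": "...", "wtype": "n"}, ...], # words with POS tags
#         "seg": ["...", ...],                         # segmented words only
#     },
#     ...
# ]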