AC自动机算法敏感词


# -*- coding:utf-8 -*-

import time
import re
time1=time.time()

# AC自动机算法
class node(object):
    def __init__(self):
        self.next = {}
        self.fail = None
        self.isWord = False
        self.word = ""

class ac_automation(object):

    def __init__(self):
        self.root = node()


    def clean_zh_text(self, text):
        # keep English, digital and Chinese
        comp = re.compile('[^A-Z^a-z^0-9^\u4e00-\u9fa5]')
        return comp.sub('', text)

    # 添加敏感词函数
    def addword(self, word):
        temp_root = self.root
        for char in word:
            if char not in temp_root.next:
                temp_root.next[char] = node()
            temp_root = temp_root.next[char]
        temp_root.isWord = True
        temp_root.word = word

    # 失败指针函数
    def make_fail(self):
        temp_que = []
        temp_que.append(self.root)
        while len(temp_que) != 0:
            temp = temp_que.pop(0)
            p = None
            for key,value in temp.next.item():
                if temp == self.root:
                    temp.next[key].fail = self.root
                else:
                    p = temp.fail
                    while p is not None:
                        if key in p.next:
                            temp.next[key].fail = p.fail
                            break
                        p = p.fail
                    if p is None:
                        temp.next[key].fail = self.root
                temp_que.append(temp.next[key])

    # 查找敏感词函数
    def search(self, content):
        p = self.root
        result = []
        currentposition = 0

        while currentposition < len(content):
            word = content[currentposition]
            while word in p.next == False and p != self.root:
                p = p.fail

            if word in p.next:
                p = p.next[word]
            else:
                p = self.root

            if p.isWord:
                result.append(p.word)
                p = self.root
            currentposition += 1
        return result

    # 加载敏感词库函数
    def parse(self, path):
        with open(path,encoding='utf-8') as f:
            for keyword in f:
                self.addword(str(keyword).strip())

    # 敏感词替换函数
    def words_replace(self, text):
        """
        :param ah: AC自动机
        :param text: 文本
        :return: 过滤敏感词之后的文本
        """
        result = list(set(self.search(text)))
        if result:
            print(result,"是非法字符")
        for x in result:
            m = text.replace(x, 'OpenVINO ' * len(x))
            text = m
        return text





if __name__ == '__main__':
    ah = ac_automation()
    path='sensitive_words.txt'
    ah.parse(path)
    text1="新-疆-骚-乱-苹果新品发布会"
    text1 = ah.clean_zh_text(text1)
    text2=ah.words_replace(text1)

    print(text1)
    print(text2)

    time2 = time.time()
    print('总共耗时:' + str(time2 - time1) + 's')