API Reference

`HBaseOps`

Bases: object

Demo:

    key = 'test'
    db = HBaseOps()
    data = db.query_single_line(table='table', row_key=key)
    print(data)

Source code in src/nlpertools/data_client.py

class HBaseOps(object):
    """
    Convenience wrapper around a ``happybase`` connection.

    demo
    key = 'test'
    db = HBaseOps()
    data = db.query_single_line(table='table', row_key=key)
    print(data)
    """

    def __init__(self, config=global_db_config["hbase"]):
        """
        Read the connection settings from ``config`` and open the connection.

        :param config: mapping providing the ``DEFAULT_HOST``, ``DEFAULT_PORT``,
            ``DEFAULT_COMPAT``, ``DEFAULT_TRANSPORT`` and ``DEFAULT_PROTOCOL`` keys
        """
        self.host = config["DEFAULT_HOST"]
        self.port = config["DEFAULT_PORT"]
        self.compat = config["DEFAULT_COMPAT"]
        self.table_prefix = None  # namespace
        self.transport = config["DEFAULT_TRANSPORT"]
        self.protocol = config["DEFAULT_PROTOCOL"]
        self.conn = self.connect()

    def connect(self):
        """Open and return a new ``happybase.Connection`` built from the stored settings."""
        conn = happybase.Connection(host=self.host, port=self.port, timeout=None, autoconnect=True,
                                    table_prefix=self.table_prefix, compat=self.compat,
                                    transport=self.transport, protocol=self.protocol)
        return conn

    @staticmethod
    def _decode_rows(rows):
        """Decode a ``{row_key: {column: value}}`` mapping of bytes into UTF-8 strings."""
        return {
            row_key.decode('utf-8'): {col.decode('utf-8'): val.decode('utf-8') for col, val in cells.items()}
            for row_key, cells in rows.items()
        }

    def create_hb_table(self, table_name, **families):
        """Create ``table_name``; the keyword arguments are passed on as the families mapping."""
        self.conn.create_table(table_name, families)

    def single_put(self, table_name, row_key, column, data):
        """
        Write one row.

        Every key ``k`` of ``data`` is stored under ``"<column>:<k>"`` and every
        value is converted with ``str()`` and encoded as UTF-8.
        """
        hb = happybase.Table(table_name, self.conn)
        hb.put(row_key,
               data={f'{column}:{k}': str(v).encode("utf-8") for k, v in data.items()})

    def batch_put(self, table, row_key_name, column, datas, batch_size=1):
        """
        Write many rows, ``batch_size`` records per batch.

        :param table: table name
        :param row_key_name: key inside each record whose value is used as the row key
            (it is removed from the cells that get written)
        :param column: prefix placed before every record key as ``"<column>:<key>"``
        :param datas: sliceable sequence of dict records
        :param batch_size: number of records per batch
        :return: the last batch object used, or ``None`` when ``datas`` is empty
        """
        hb = happybase.Table(table, self.conn)
        row_key_column = f'{column}:{row_key_name}'
        batch = None  # previously unbound (UnboundLocalError) when ``datas`` was empty
        for start in range(0, len(datas), batch_size):
            with hb.batch(batch_size=batch_size) as batch:
                for record in datas[start:start + batch_size]:
                    cells = {f'{column}:{k}': v for k, v in record.items()}
                    row_key = cells.pop(row_key_column)
                    batch.put(row_key, cells)
        return batch

    def single_put_self(self, table_name, row_keys, datas):
        """
        Write the first two items of every value in ``datas`` under two fixed column names.

        ``row_keys`` and ``datas.items()`` are walked in lockstep; the keys of
        ``datas`` themselves are not used.
        """
        hb = happybase.Table(table_name, self.conn)
        for row_key, (_, val) in zip(row_keys, datas.items()):
            hb.put(row_key, {'maybe_table_name:maybe_column_name': "%s" % val[0],
                             'maybe_table_name:maybe_column_name2': "%s" % val[1]})

    def scan_table(self, table, row_start=None, row_stop=None, include_timestamp=False, limit=None, timestamps=None,
                   filter=None):
        """
        Scan ``table`` and return ``{row_key: {column: value}}`` with everything decoded as UTF-8.

        ``include_timestamp`` is accepted but is not forwarded to the scan.
        Returns ``{}`` when nothing matches.
        """
        hb = happybase.Table(table, self.conn)
        scan = hb.scan(row_start=row_start, row_stop=row_stop, limit=limit, timestamp=timestamps, filter=filter)
        # ``str(x).decode(...)`` was a Python 2 idiom; on Python 3 ``str`` has no
        # ``decode``, so decode the scanned keys/values directly like the query methods do.
        return self._decode_rows(dict(scan))

    def query_single_line(self, table, row_key):
        """
        Fetch one row over a fresh connection and return ``{column: value}`` decoded as UTF-8.

        Returns ``{}`` when the row has no cells.
        """
        conn = self.connect()
        try:
            hb = happybase.Table(table, conn)
            hb_dict = hb.row(row_key)
        finally:
            # The per-call connection used to be left open on every query.
            conn.close()
        return {k.decode('utf-8'): v.decode('utf-8') for k, v in hb_dict.items()}

    def query_multi_lines(self, table, row_keys):
        """Fetch several rows and return ``{row_key: {column: value}}`` decoded as UTF-8."""
        hb = happybase.Table(table, self.conn)
        return self._decode_rows(dict(hb.rows(row_keys)))

    def single_delete(self, table, row_key):
        """Delete the row ``row_key`` from ``table``."""
        hb = happybase.Table(table, self.conn)
        hb.delete(row_key)

    def test_scan(self, table):
        """Scan up to 1000 rows with a fixed ``SingleColumnValueFilter`` and return them decoded."""
        hb = happybase.Table(table, self.conn)
        scan_filter = "SingleColumnValueFilter ('maybe_column_name', 'lang', =, 'regexstring:[regex_string]')"
        scan = hb.scan(limit=1000, filter=scan_filter)
        return self._decode_rows(dict(scan))

    def close(self):
        """Close the connection opened in ``__init__``."""
        self.conn.close()

`MongoOps`

Bases: object

Source code in src/nlpertools/data_client.py

class MongoOps(object):
    """Helper bound to one MongoDB collection selected by ``config``."""

    def __init__(self, config=global_db_config["mongo"]):
        """
        :param config: mapping providing the ``uri``, ``db`` and ``col`` keys
        """
        mongo_client = MongoClient(config["uri"])
        db = mongo_client[config["db"]]
        self.collection = db[config["col"]]

    def fetch_all(self):
        """
        Read every document of the collection.

        :return: list of documents whose ``_id`` has been converted to ``str``
        """
        ans = []
        print('提取所有数据.')
        for record in self.collection.find({}):
            record['_id'] = str(record['_id'])
            ans.append(record)
        return ans

    def load_from_mongo(self, special_value):
        """
        Find the documents whose field named ``str(special_value)`` equals ``special_value``.

        :param special_value: used both as the field name (stringified) and as the value
        :return: the matching document with the shortest ``another_value``
            (a missing field counts as length 0), or ``None`` when nothing matches
        """
        records = list(self.collection.find({"{}".format(special_value): special_value}))
        if not records:
            return None
        # ``min`` keeps the first of equally short candidates, like ``sorted(...)[0]`` did.
        return min(records, key=lambda x: len(x.get("another_value", [])))

    def delete_all(self):
        """Delete every document and return the result of ``delete_many``."""
        query = {}
        deleted = self.collection.delete_many(query)
        return deleted

    def delete_by_time(self, time):
        """
        Delete the documents whose ``name`` starts with ``"F"``.

        NOTE(review): ``time`` is not used by the query -- confirm the intended filter.

        :return: the result of ``delete_many`` (returned for parity with ``delete_all``)
        """
        query = {"name": {"$regex": "^F"}}
        deleted = self.collection.delete_many(query)
        return deleted

    def fetch_by_time(self, year=2022, month=7, day=7, hour=7, minute=7, second=7):
        """
        Read the documents whose ``query_time`` is at or after the given moment.

        :return: list of documents, newest ``query_time`` first, with ``_id`` converted to ``str``
        """
        query = {"query_time": {"$gte": datetime.datetime(year, month, day, hour, minute, second)}}
        sort_sql = [("query_time", -1)]
        ans = []
        print('提取所有数据.')
        for record in self.collection.find(query).sort(sort_sql):
            record['_id'] = str(record['_id'])
            ans.append(record)
        return ans

    def save_to_mongo(self, special_value, each_item):
        """
        Append ``each_item`` to an existing document, or insert a new one.

        NOTE(review): the lookup uses ``str(special_value)`` as the field name while the
        inserted document stores it under the literal ``"special_value"`` key -- confirm
        which of the two is intended.

        :param special_value: value identifying the document
        :param each_item: item pushed onto the ``each_item`` array
        :return: None
        """
        lookup = {"{}".format(special_value): special_value}
        # ``find_one`` answers the existence question without loading every match.
        if self.collection.find_one(lookup) is not None:
            self.collection.update_one(lookup, {"$push": {'each_item': each_item}})
        else:
            insert_item = {
                "special_value": special_value,
                "each_item": [each_item]
            }
            self.collection.insert_one(insert_item)
        print("update success")

    def insert_one(self, data):
        """Insert ``data`` as one document."""
        self.collection.insert_one(data)

    def update_to_mongo(self, condition_term, condition_value, new_value):
        """
        Look a document up by field/value and update it, or insert a new one (similar to an update).

        When a match exists, ``new_value`` is applied as the ``$push`` specification;
        otherwise a document holding the condition and ``processed_data`` is inserted.

        :param condition_term: name of the field to match on
        :param condition_value: value that field must have
        :param new_value: the new value; preferably a dict (other types are untested)
        :return: None
        """
        lookup = {condition_term: condition_value}
        # ``find_one`` answers the existence question without loading every match.
        if self.collection.find_one(lookup) is not None:
            self.collection.update_one(lookup, {"$push": new_value})
        else:
            insert_item = {
                condition_term: condition_value,
                "processed_data": new_value
            }
            self.collection.insert_one(insert_item)
        print("update success")

`fetch_all()`

读取所有数据

Returns:

Type	Description

Source code in src/nlpertools/data_client.py

def fetch_all(self):
    """
    Read every document of the collection.

    :return: list of documents whose ``_id`` has been converted to ``str``
    """

    def _with_str_id(document):
        # The document is updated in place and handed back for collection.
        document['_id'] = str(document['_id'])
        return document

    print('提取所有数据.')
    return list(map(_with_str_id, self.collection.find({})))

`load_from_mongo(special_value)`

读取mongodb该special_value下所有值为special_value的数据

Returns:

Type	Description

Source code in src/nlpertools/data_client.py

def load_from_mongo(self, special_value):
    """
    Find the documents whose field named ``str(special_value)`` equals ``special_value``.

    :param special_value: used both as the field name (stringified) and as the value
    :return: the matching document with the shortest ``another_value``
        (a missing field counts as length 0), or ``None`` when nothing matches
    """
    field_name = "{}".format(special_value)
    candidates = list(self.collection.find({field_name: special_value}))
    if candidates:
        # ``min`` keeps the first of equally short candidates, as a stable sort would.
        return min(candidates, key=lambda doc: len(doc.get("another_value", [])))
    return None

`save_to_mongo(special_value, each_item)`

数据存入mongo

Parameters:

Name	Type	Description	Default
`special_value`			required
`each_item`			required

Returns:

Type	Description

Source code in src/nlpertools/data_client.py

def save_to_mongo(self, special_value, each_item):
    """
    Append ``each_item`` to an existing document, or insert a new one.

    NOTE(review): the lookup uses ``str(special_value)`` as the field name while the
    inserted document stores it under the literal ``"special_value"`` key -- confirm
    which of the two is intended.

    :param special_value: value identifying the document
    :param each_item: item pushed onto the ``each_item`` array
    :return: None
    """
    lookup = {"{}".format(special_value): special_value}
    # ``find_one`` answers the existence question without loading every match.
    if self.collection.find_one(lookup) is not None:
        self.collection.update_one(lookup, {"$push": {'each_item': each_item}})
    else:
        insert_item = {
            "special_value": special_value,
            "each_item": [each_item]
        }
        self.collection.insert_one(insert_item)
    print("update success")

`update_to_mongo(condition_term, condition_value, new_value)`

根据提供的字段和值，查询出对应的数据，更新数据存入mongo（类似 update）。

Parameters:

Name	Description	Default
`condition_term`	条件字段term	required
`condition_value`	条件字段值	required
`new_value`	新的值。最好是dict，不是dict的话不知道行不行	required

Returns:

Type	Description

Source code in src/nlpertools/data_client.py

def update_to_mongo(self, condition_term, condition_value, new_value):
    """
    Look a document up by field/value and update it, or insert a new one (similar to an update).

    When a match exists, ``new_value`` is applied as the ``$push`` specification;
    otherwise a document holding the condition and ``processed_data`` is inserted.

    :param condition_term: name of the field to match on
    :param condition_value: value that field must have
    :param new_value: the new value; preferably a dict (other types are untested)
    :return: None
    """
    lookup = {condition_term: condition_value}
    # ``find_one`` answers the existence question without loading every match.
    if self.collection.find_one(lookup) is not None:
        self.collection.update_one(lookup, {"$push": new_value})
    else:
        insert_item = {
            condition_term: condition_value,
            "processed_data": new_value
        }
        self.collection.insert_one(insert_item)
    print("update success")

`CopyFunc`

Source code in src/nlpertools/dataprocess.py

class CopyFunc():
    """Helpers for detecting CJK ideographs; callable on the class or on an instance."""

    # from https://github.com/lemon234071/clean-dialog
    @staticmethod
    def is_chinese_char(cp):
        """Checks whether CP is the codepoint of a CJK character."""
        # This defines a "chinese character" as anything in the CJK Unicode block:
        #   https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
        #
        # Note that the CJK Unicode block is NOT all Japanese and Korean characters,
        # despite its name. The modern Korean Hangul alphabet is a different block,
        # as is Japanese Hiragana and Katakana. Those alphabets are used to write
        # space-separated words, so they are not treated specially and handled
        # like the all of the other languages.
        return (
                0x4E00 <= cp <= 0x9FFF
                or 0x3400 <= cp <= 0x4DBF
                or 0x20000 <= cp <= 0x2A6DF
                or 0x2A700 <= cp <= 0x2B73F
                or 0x2B740 <= cp <= 0x2B81F
                or 0x2B820 <= cp <= 0x2CEAF
                or 0xF900 <= cp <= 0xFAFF
                or 0x2F800 <= cp <= 0x2FA1F
        )

    @staticmethod
    def contains_Chinese(seq):
        """Return True when at least one character of ``seq`` is a CJK character."""
        # Qualify the helper with the class: a bare ``is_chinese_char`` is looked up
        # as a module global and is not found from inside the class.
        return any(CopyFunc.is_chinese_char(ord(char)) for char in seq)

`is_chinese_char(cp)`

Checks whether CP is the codepoint of a CJK character.

Source code in src/nlpertools/dataprocess.py

class CopyFunc():
    """Helpers for detecting CJK ideographs; callable on the class or on an instance."""

    # from https://github.com/lemon234071/clean-dialog
    @staticmethod
    def is_chinese_char(cp):
        """Checks whether CP is the codepoint of a CJK character."""
        # This defines a "chinese character" as anything in the CJK Unicode block:
        #   https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
        #
        # Note that the CJK Unicode block is NOT all Japanese and Korean characters,
        # despite its name. The modern Korean Hangul alphabet is a different block,
        # as is Japanese Hiragana and Katakana. Those alphabets are used to write
        # space-separated words, so they are not treated specially and handled
        # like the all of the other languages.
        return (
                0x4E00 <= cp <= 0x9FFF
                or 0x3400 <= cp <= 0x4DBF
                or 0x20000 <= cp <= 0x2A6DF
                or 0x2A700 <= cp <= 0x2B73F
                or 0x2B740 <= cp <= 0x2B81F
                or 0x2B820 <= cp <= 0x2CEAF
                or 0xF900 <= cp <= 0xFAFF
                or 0x2F800 <= cp <= 0x2FA1F
        )

    @staticmethod
    def contains_Chinese(seq):
        """Return True when at least one character of ``seq`` is a CJK character."""
        # Qualify the helper with the class: a bare ``is_chinese_char`` is looked up
        # as a module global and is not found from inside the class.
        return any(CopyFunc.is_chinese_char(ord(char)) for char in seq)

`Pattern`

Example:

    pattern_special_char = re.compile("[{}]".format(Pattern.special_char_pattern))
    a = "\U000d8be6asdasdas \x00v啊实打实\x00\x00v阿松大\x00"
    res = re.sub(pattern_special_char, "$", a)

Source code in src/nlpertools/dataprocess.py

class Pattern:
    r"""
    Namespace of regular-expression source strings used for text cleaning.

    Example::

        pattern_special_char = re.compile("[{}]".format(Pattern.special_char_pattern))
        a = "\U000d8be6asdasdas \x00v啊实打实\x00\x00v阿松大\x00"
        res = re.sub(pattern_special_char, "$", a)

    NOTE(review): the patterns containing ``\p{...}`` classes are not accepted by the
    standard ``re`` module; presumably they are meant for the third-party ``regex``
    package -- confirm against the callers.
    """

    # some from data-prepare

    # emoji
    # Another way to obtain the emoji set; not sure whether the pattern below is complete:
    #   import emoji  # Use version emoji==1.6.1, otherwise it won't have UNICODE_EMOJI
    #   emoji = list(emoji.UNICODE_EMOJI["en"].keys())
    emoji_pattern = "[\U00010000-\U0010ffff\\uD800-\\uDBFF\\uDC00-\\uDFFF]"

    # Special garbage or invisible characters
    # \x 09:\t 0a:\n 0d:\r
    special_char_x_pattern = "[\x00-\x08\x0b\x0c\x0e\x0f\x10-\x19\x1a-\x1f]"
    # Abnormal characters collected from large-scale corpus statistics
    special_char_u_pattern = (
        "[\u3000\U000d8be6\U000e0062\U000e0063\U000e0067\U000e0073\U000e0074\U000e007f]"
    )
    # Both sets above with their surrounding brackets stripped, ready to be wrapped in "[...]"
    special_char_pattern = "{}{}".format(
        special_char_x_pattern[1:-1], special_char_u_pattern[1:-1]
    )
    non_printing_characters_pattern = (
        f"[{''.join(map(chr, list(range(0, 32)) + list(range(127, 160))))}]"
    )

    # The following must be matched from the start of the text, otherwise they are meaningless
    # Chinese person name
    chinese_name_pattern = "(?:[\u4e00-\u9fa5·]{2,3})"
    # English person name
    english_name_pattern = r"(^[a-zA-Z][a-zA-Z\s]{0,20}[a-zA-Z]$)"
    # Digits only
    pure_num_pattern = r"\d+"
    # Phrases such as "xxxx figure/table"
    pic_table_descript_pattern = ".{1,15}图"

    # The following do not need to be matched from the start.
    # hlink
    hlink_pattern = (
        r"(https?|ftp|file)://[-A-Za-z0-9+&@#/%?=~_|!:,.;]+[-A-Za-z0-9+&@#/%=~_|]"
    )
    http_pattern = r"(http|https):\/\/([\w.]+\/?)\S*/\S*"
    # E-mail (the backslash is doubled so the \u escapes keep working in a normal string)
    email_pattern = "[A-Za-z0-9\u4e00-\u9fa5]+@[a-zA-Z0-9_-]+(\\.[a-zA-Z0-9_-]+)+"
    # html; possibly too strict
    html_pattern = r"<[\s\S]*?>"
    # Repeated characters, e.g. "aaaa". Raw string: in a normal string "\1" is the
    # control character chr(1), not a backreference to group 1.
    repeat_pattern = r"(.)\1+"
    # Date (raw string for the same "\1" reason: both separators must be the same "-")
    day_time_pattern = r"\d{1,4}(-)(1[0-2]|0?[1-9])\1(0?[1-9]|[1-2]\d|30|31)"
    # Time of day
    hour_time_pattern = r"(?:[01]\d|2[0-3]):[0-5]\d:[0-5]\d"
    # Stock code
    stock_pattern = (
        r"(s[hz]|S[HZ])(000[\d]{3}|002[\d]{3}|300[\d]{3}|600[\d]{3}|60[\d]{4})"
    )

    # Usually these are replaced rather than filtered
    # Redundant spaces => " "
    redundancy_space_pattern = " +"
    # Rarely needed: redundant line breaks => " "
    linebreak_pattern = "[\r\n\t]+"

    # Weibo video mentions and similar
    weibo_pattern = r"([\s]\w+(的微博视频)|#|【|】|转发微博)"
    # @
    at_pattern = r"@\w+"

    # from https://github.com/bigscience-workshop/data-preparation pii
    year_patterns = [
        r"(?:^|[\b\s@?,!;:\'\")(.\p{Han}])([1-2][0-9]{3}[\p{Pd}/][1-2][0-9]{3})(?:$|[\s@,?!;:\'\"(.\p{Han}])",
        # yyyy-yyyy or yyyy/yyyy
        r"(?:^|[\b\s@?,!;:\'\")(.\p{Han}])([1-2][0-9]{3}[\p{Pd}/.][0-3][0-9][\p{Pd}/.][0-3][0-9])(?:$|[\s@,?!;:\'\"(.\p{Han}])",
        # yyyy-mm-dd or yyyy-dd-mm or yyyy/mm/dd or yyyy/dd/mm or yyyy.mm.dd or yyyy.dd.mm
        r"(?:^|[\b\s@?,!;:\'\")(.\p{Han}])([0-3][0-9][\p{Pd}/.][0-3][0-9][\p{Pd}/.](?:[0-9]{2}|[1-2][0-9]{3}))(?:$|[\s@,?!;:\'\"(.\p{Han}])",
        # mm-dd-yyyy or dd-mm-yyyy or mm/dd/yyyy or dd/mm/yyyy or mm.dd.yyyy or dd.mm.yyyy or the same but with yy instead of yyyy
        r"(?:^|[\b\s@?,!;:\'\")(.\p{Han}])([0-3][0-9][\p{Pd}/](?:[0-9]{2}|[1-2][0-9]{3}))(?:$|[\s@,?!;:\'\"(.\p{Han}])",
        # mm-yyyy or mm/yyyy or the same but with yy
        r"(?:^|[\b\s@?,!;:\'\")(.\p{Han}])([1-2][0-9]{3}-[0-3][0-9])(?:$|[\s@,?!;:\'\"(.\p{Han}])",
        # yyyy-mm or yyyy/mm
    ]

    # Patterns for high-risk character strings
    id_pattern = r'(?:^|[\b\s@?,!;:\'\")(.\p{Han}])([A-Za-z]*(?:[\p{Pd}]*\p{Nd}){6,})(?:$|[\b\s@?,!;:\'\")(.\p{Han}])'
    # https://regex101.com/r/JQkmh8/2
    # key_pattern = r'(?:^|[\b\s@?,!;:\'\")(.\p{Han}])((?:(?:[A-Za-z]+[\p{Nd}\p{Pd}\/\+\=:]+|[\p{Nd}\p{Pd}\/\+\=:]+[A-Za-z]+)){4,}|(?:(?:\p{Nd}{3,}|[A-Z]+\p{Nd}+[A-Z]*|\p{Nd}+[A-Z]+\p{Nd}*)[\s\p{Pd}]?){4,})(?:$|[\b\s\p{Han}@?,!;:\'\"])'
    # https://regex101.com/r/JQkmh8/5
    key_pattern = r'(?:^|[\b\s@?,!:;\'\")(.\p{Han}])((?:(?:[A-Za-z]+[\p{Nd}\p{Pd}\/\+\=:_]+|[\p{Nd}\p{Pd}\/\+\=:]+[A-Za-z]+)){4,}|(?:(?:\p{Nd}{3,}|[A-Z]+\p{Nd}+[A-Z]*|\p{Nd}+[A-Z]+\p{Nd}*)[ \p{Pd}]?){3,})(?:$|[\b\s\p{Han}@?,!;:\'\")(.])'
    ipv4_pattern = r'(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)(?:\.(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)){3}'
    ipv6_pattern = r'(?:[0-9a-fA-F]{1,4}:){7,7}[0-9a-fA-F]{1,4}|(?:[0-9a-fA-F]{1,4}:){1,7}:|(?:[0-9a-fA-F]{1,4}:){1,6}:[0-9a-fA-F]{1,4}|(?:[0-9a-fA-F]{1,4}:){1,5}(?::[0-9a-fA-F]{1,4}){1,2}|(?:[0-9a-fA-F]{1,4}:){1,4}(?::[0-9a-fA-F]{1,4}){1,3}|(?:[0-9a-fA-F]{1,4}:){1,3}(?::[0-9a-fA-F]{1,4}){1,4}|(?:[0-9a-fA-F]{1,4}:){1,2}(?::[0-9a-fA-F]{1,4}){1,5}|[0-9a-fA-F]{1,4}:(?:(?::[0-9a-fA-F]{1,4}){1,6})|:(?:(?::[0-9a-fA-F]{1,4}){1,7}|:)|fe80:(?::[0-9a-fA-F]{0,4}){0,4}%[0-9a-zA-Z]{1,}|::(?:ffff(?::0{1,4}){0,1}:){0,1}(?:(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])\.){3,3}(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])|(?:[0-9a-fA-F]{1,4}:){1,4}:(?:(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])\.){3,3}(25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])'
    # The closing part keeps its previous value; the backslashes are only written explicitly now.
    ip_pattern = r"(?:^|[\b\s@?,!;:\'\")(.\p{Han}])(" + r"|".join(
        [ipv4_pattern, ipv6_pattern]) + ")(?:$|[\\s@,?!;:'\"(.\\p{Han}])"

    # https://regex101.com/r/EpA5B7/1
    email_line_pattern = r'''
        (?<= ^ | [\b\s@,?!;:)('".\p{Han}<] )
        (
        [^\b\s@?!;,:)('"<]+
        @
        [^\b\s@!?;,/]*
        [^\b\s@?!;,/:)('">.]
        \.
        \p{L} \w{1,}
        )
        (?= $ | [\b\s@,?!;:)('".\p{Han}>] )
    '''

    # https://regex101.com/r/mOqi1s/3
    # user_pattern = r'(?:^|[\s@,?!;:\'\")(\p{Han}])(@[^\s@,?!;:\'\")(]{3,})'
    user_pattern = r'''
    (?<= ^ | [)(\s@,?!;:'"\p{Han}] )
    (@
        [^)(\s@,?!;:'"]{3,}
    )
    '''

`TextProcess`

Bases: object

数据处理类。这是基类，如果是定制化的语言处理，请继承该类。

Source code in src/nlpertools/dataprocess.py

class TextProcess(object):
    """
    Text-cleaning base class.

    Subclass it for language- or corpus-specific processing.
    """

    def __init__(
            self,
            patterns_filter: List = None,
            patterns_replace: List[List] = None,
            words_filter: List = None
    ):
        """
        :param patterns_filter: regex sources; a text matching any of them at its
            start is reported by ``filter_pattern``. ``None`` means no patterns.
        :param patterns_replace: ``[pattern, replacement]`` pairs applied in order by
            ``replace_pattern``. ``None`` means no replacements.
        :param words_filter: words whose presence makes ``filter_words`` return True.
            ``None`` means an empty list (a fresh one per instance).
        """
        self.patterns_filter, self.patterns_replace = self._pre_complie_pattern(
            patterns_filter, patterns_replace
        )
        self.words_filter = [] if words_filter is None else words_filter

    @staticmethod
    def _pre_complie_pattern(patterns_filter, patterns_replace):
        """Compile both pattern collections; ``None`` is treated as empty."""
        complied_patterns_filter = [re.compile(i) for i in patterns_filter or []]
        complied_patterns_replace = [(re.compile(i[0]), i[1]) for i in patterns_replace or []]
        return complied_patterns_filter, complied_patterns_replace

    def process(self, text):
        """Run the standard cleaning pipeline on ``text`` and return the result."""
        # Normalisation applied to every incoming text
        text = self.full2half(text)
        # text = self.filter_http(text)
        text = self.filter_html(text)
        text = self.filter_html_special(text)
        # Handling that depends on the source type and language
        text = self.filter_exclusive(text)
        # text = self.trandition2simple(text)
        # text = self.remove_stopwords(text)
        return text

    def filter_words(self, text):
        """Return True when ``text`` contains any word of ``self.words_filter`` (i.e. drop it)."""
        return any(word in text for word in self.words_filter)

    def filter_whitelist(self, text):
        """Remove every character that is not in the whitelist character class below."""
        whitelist = re.compile(
            "[^\u4e00-\u9fa5^0-9a-zA-Z^-^《^》^<^>^【^】^（^）^{^}^–^…^”^“^,^.^;^?^:^‘^~^`^，^。^？^；^！^：^、^·^!^@^#^$^%^&^(^)^|]"
        )
        text = whitelist.sub("", text)
        return text

    def text_split(self, text, language):
        """Truncate ``text`` to 256 characters for "en" and 510 for "zh"; other languages are untouched."""
        if language == "en":
            text = text[:256]
        elif language == "zh":
            text = text[:510]
        return text

    def trandition2simple(self, text):
        # Chinese only
        """
        Convert ``text`` with ``zhconv.convert(text, "zh-cn")``.

        https://juejin.cn/post/7234554420163100728
        """
        # The call used to convert a hard-coded sample sentence and ignore ``text``.
        text = zhconv.convert(text, "zh-cn")
        return text

    def remove_stopwords(self, text):
        """
        Tokenise ``text`` and drop the tokens found in ``self.stopwords``.

        NOTE(review): ``self.language`` and ``self.stopwords`` are not set by
        ``__init__``; presumably a subclass provides them -- confirm.

        :return: list of the remaining tokens
        """
        if self.language == "en":
            tokens = text.split(" ")
        else:
            tokens = jieba.lcut(text)
        return [token for token in tokens if token not in self.stopwords]

    @staticmethod
    def split_sentence(sentence, language="chinese"):
        """
        Split ``sentence`` into sentences on terminal punctuation.

        A closing quote directly after a terminator stays with the sentence it
        closes, and runs of terminators are kept together.

        :param sentence: text to split
        :param language: "chinese" or "english"
        :return: list of stripped, non-empty sentences
        """
        # sentences->Str
        # example '12“345。”“6789”'
        assert language in ["chinese", "english"], "unsupportable for other language"
        if language == "chinese":
            split_signs, other_sign = list("。！？…\t"), "”"
        else:
            split_signs, other_sign = list(".!?"), '"'

        sentences = []
        start_idx = 0
        last_idx = len(sentence) - 1
        for idx, char in enumerate(sentence):
            if idx == last_idx:
                # Whatever is left belongs to the final sentence.
                sentences.append(sentence[start_idx:].strip())
                break
            if char not in split_signs:
                continue
            follower = sentence[idx + 1]
            if follower == other_sign:
                # Handles 。”。 : only cut after the quote when no terminator follows it.
                if idx < last_idx - 1 and sentence[idx + 2] not in split_signs:
                    cut = idx + 2
                else:
                    continue
            elif follower not in split_signs:
                cut = idx + 1
            else:
                continue
            sentences.append(sentence[start_idx:cut].strip())
            start_idx = cut
        return [part for part in (s.strip() for s in sentences) if part]

    def cut_word(self, text, language):
        """Tokenise on single spaces for "en", otherwise with ``jieba.lcut``."""
        if language == "en":
            tokens = text.split(" ")
        else:
            tokens = jieba.lcut(text)
        return tokens

    def full2half(self, text):
        """
        Convert full-width characters to half-width.

        Only code points 33+65248 .. 126+65248 are shifted; everything else is kept.

        :param text: input text
        :return: converted text
        """
        offset = 65248
        low, high = 33 + offset, 126 + offset
        return "".join(chr(ord(ch) - offset) if low <= ord(ch) <= high else ch for ch in text)

    def filter_html(self, text):
        # This one is fairly strict
        """
        Remove HTML tags.

        :param text: input text
        :return: text without CDATA, script/style blocks, tags and comments
        """
        # The patterns had lost their backslashes ("<s*script", "</?w+", unescaped
        # "[CDATA["), so ordinary tags such as <p> were never removed.
        patterns = [
            re.compile(r"//<!\[CDATA\[[^>]*//\]\]>", re.I),  # CDATA
            re.compile(r"<\s*script[^>]*>[^<]*<\s*/\s*script\s*>", re.I),  # Script
            re.compile(r"<\s*style[^>]*>[^<]*<\s*/\s*style\s*>", re.I),  # style
            re.compile(r"<br\s*?/?>"),  # line breaks
            re.compile(r"</?\w+[^>]*>"),  # HTML tags
            re.compile(r"<!--[^>]*-->"),  # HTML comments
        ]
        for pattern in patterns:
            text = pattern.sub("", text)
        return text

    def filter_html_special(self, text):
        """
        Replace HTML character entities.

        Known entities are replaced by their character, unknown ones by the empty
        string. The text is re-scanned after every replacement.

        :param text: input text
        :return: text without ``&name;`` / ``&#number;`` entities
        """
        # TODO html entities look like &nbsp; is \xa0 one as well?
        CHAR_ENTITIES = {
            # Keys are the captured name without "&" and ";" ("&nbsp" could never match).
            "nbsp": " ",
            "160": " ",
            "lt": "<",
            "60": "<",
            "gt": ">",
            "62": ">",
            "amp": "&",
            "38": "&",
            "quot": '"',
            "34": '"',
            "ldquo": '"',
            "rdquo": '"',
            "mdash": "",
            "\xa0": "",
        }

        re_charEntity = re.compile(r"&#?(?P<name>\w+);", re.S)
        sz = re_charEntity.search(text)
        while sz:
            # Unknown entities are replaced by the empty string.
            replacement = CHAR_ENTITIES.get(sz.group("name"), "")
            text = text[:sz.start()] + replacement + text[sz.end():]
            sz = re_charEntity.search(text)
        return text

    def filter_exclusive(self, text):
        """
        Remove Weibo/Twitter-specific noise: video captions, "#", "【", "】",
        repost markers and @mentions.

        :return: cleaned text
        """
        pattern = r"([\s]\w+(的微博视频)|#|【|】|转发微博)"
        p = re.compile(pattern, re.S)
        text = p.sub("", text)

        dr = re.compile(r"@\w+", re.S)
        text = dr.sub("", text)

        return text

    def filter_html_tag(self, text):
        """Strip a fixed set of tags (a, span, img, br) plus "//", "@" and "</" from ``text``."""
        # res_tr = r'<a (.*?)></a>'
        # m_tr = re.findall(res_tr,text,re.S|re.M)
        res = re.sub(r"<a.*?>", "", text)
        res = re.sub(r"</a>", "", res)
        res = re.sub(r"<span.*?>", "", res)
        res = re.sub(r"</span>", "", res)
        res = re.sub(r"<img.*?>", "", res)
        res = re.sub(r"<br.*?>", "", res)
        res = re.sub(r"//", "", res)
        res = re.sub(r"@", "", res)
        res = re.sub(r"</", "", res)
        # res = re.sub(r',', '', res)
        # res = re.sub(r'&nbsp;', '', res)
        return res

    @staticmethod
    def uniform_whitespace(
            document,
            whitespace=[
                " ",
                "\u2009",
                "\u200a",
                "\u202f",
                "\u2005",
                "\u3000",
                "\u2002",
                "\xa0",
                "\u2008",
                "\u2003",
                "\ufffc",
                "\x84",
            ],
    ):
        # from https://github.com/bigscience-workshop/data-preparation
        """
        There are different whitespace characters.

        Every character of ``document`` found in ``whitespace`` is replaced by a plain
        space. The defaults are written as escapes because most of them are invisible.
        """
        whitespace = set(whitespace)
        document = "".join(
            [char if char not in whitespace else " " for char in document]
        )
        return document

    def filter_pattern(self, text):
        """
        Return True when ``text`` matches one of the filter patterns at its start
        (i.e. the text should be dropped).
        """
        return any(re.match(pattern, text) for pattern in self.patterns_filter)

    def replace_pattern(self, text):
        """Apply every ``(pattern, replacement)`` pair of ``self.patterns_replace`` in order."""
        for pattern, replace in self.patterns_replace:
            text = re.sub(pattern, replace, text)
        return text

`__init__(patterns_filter=None, patterns_replace=None, words_filter=[])`

pattern_list:

Source code in src/nlpertools/dataprocess.py

"""

def __init__(
        self,
        patterns_filter: List = None,
        patterns_replace: List[List] = None,
        words_filter: List = None
):
    """
    :param patterns_filter: regex sources used for filtering; ``None`` means no patterns
    :param patterns_replace: ``[pattern, replacement]`` pairs; ``None`` means no replacements
    :param words_filter: words used for filtering; ``None`` means an empty list
        (a fresh one per instance, so instances no longer share one default list)
    """
    # The defaults used to be passed on as ``None`` and failed when iterated.
    self.patterns_filter, self.patterns_replace = self._pre_complie_pattern(
        [] if patterns_filter is None else patterns_filter,
        [] if patterns_replace is None else patterns_replace,
    )
    self.words_filter = [] if words_filter is None else words_filter

`filter_exclusive(text)`

去除 @、 #、表情等twitter、微博“特有”的情况

Returns:

Type	Description

Source code in src/nlpertools/dataprocess.py

    return text

def filter_exclusive(self, text):
    """
    Remove Weibo/Twitter-specific noise: video captions, "#", "【", "】",
    repost markers and @mentions.

    :return: cleaned text
    """
    weibo_noise = re.compile(r"([\s]\w+(的微博视频)|#|【|】|转发微博)", re.S)
    mention = re.compile(r"@\w+", re.S)
    # The mentions are removed after the other markers, in that order.
    return mention.sub("", weibo_noise.sub("", text))

`filter_html(text)`

过滤html标签

Parameters:

Name	Type	Description	Default
`text`			required

Returns:

Type	Description

Source code in src/nlpertools/dataprocess.py

    return ret_str

def filter_html(self, text):
    # This one is fairly strict
    """
    Remove HTML tags.

    :param text: input text
    :return: text without CDATA, script/style blocks, tags and comments
    """
    # The patterns had lost their backslashes ("<s*script", "</?w+", unescaped
    # "[CDATA["), so ordinary tags such as <p> were never removed.
    patterns = [
        re.compile(r"//<!\[CDATA\[[^>]*//\]\]>", re.I),  # CDATA
        re.compile(r"<\s*script[^>]*>[^<]*<\s*/\s*script\s*>", re.I),  # Script
        re.compile(r"<\s*style[^>]*>[^<]*<\s*/\s*style\s*>", re.I),  # style
        re.compile(r"<br\s*?/?>"),  # line breaks
        re.compile(r"</?\w+[^>]*>"),  # HTML tags
        re.compile(r"<!--[^>]*-->"),  # HTML comments
    ]
    for pattern in patterns:
        text = pattern.sub("", text)
    return text

`filter_html_special(text)`

替换所有html转义字符。这个好像只有新闻有？

Parameters:

Name	Type	Description	Default
`text`			required

Returns:

Type	Description

Source code in src/nlpertools/dataprocess.py

    return text

def filter_html_special(self, text):
    """
    Replace HTML character entities.

    Known entities are replaced by their character, unknown ones by the empty
    string. The text is re-scanned after every replacement.

    :param text: input text
    :return: text without ``&name;`` / ``&#number;`` entities
    """
    # TODO html entities look like &nbsp; is \xa0 one as well?
    CHAR_ENTITIES = {
        # Keys are the captured name without "&" and ";" ("&nbsp" could never match).
        "nbsp": " ",
        "160": " ",
        "lt": "<",
        "60": "<",
        "gt": ">",
        "62": ">",
        "amp": "&",
        "38": "&",
        "quot": '"',
        "34": '"',
        "ldquo": '"',
        "rdquo": '"',
        "mdash": "",
        "\xa0": "",
    }

    re_charEntity = re.compile(r"&#?(?P<name>\w+);", re.S)
    sz = re_charEntity.search(text)
    while sz:
        # Unknown entities are replaced by the empty string.
        replacement = CHAR_ENTITIES.get(sz.group("name"), "")
        text = text[:sz.start()] + replacement + text[sz.end():]
        sz = re_charEntity.search(text)
    return text

`filter_pattern(text)`

返回True表示命中规则，需要过滤

Source code in src/nlpertools/dataprocess.py

    return document

def filter_pattern(self, text):
    """
    Return True when ``text`` matches one of the filter patterns at its start
    (i.e. the text should be dropped).
    """
    return any(re.match(compiled, text) for compiled in self.patterns_filter)

`full2half(text)`

全角转化为半角

Parameters:

Name	Type	Description	Default
`text`			required

Returns:

Type	Description

Source code in src/nlpertools/dataprocess.py

    return tokens

def full2half(self, text):
    """
    Convert full-width characters to half-width.

    Only code points 33+65248 .. 126+65248 are shifted; everything else is kept.

    :param text: input text
    :return: converted text
    """
    offset = 65248
    low, high = 33 + offset, 126 + offset
    return "".join(
        chr(ord(ch) - offset) if low <= ord(ch) <= high else ch
        for ch in text
    )

`split_sentence(sentence, language='chinese')` `staticmethod`

分句，英文有nltk，中文怎么能没有好的分句工具呢

Parameters:

Name	Type	Description	Default
`sentence`			required
`language`			`'chinese'`

Returns:

Type	Description

Source code in src/nlpertools/dataprocess.py

    return new_tokens

@staticmethod
def split_sentence(sentence, language="chinese"):
    """
    Split a string into sentences on terminal punctuation.

    A terminator that is directly followed by the closing quote keeps the
    quote in the same sentence, and a run of consecutive terminators is kept
    together. Empty pieces are dropped from the result.

    Example: '12“345。”“6789”' -> ['12“345。”', '“6789”']

    :param sentence: text to split
    :param language: "chinese" or "english"
    :return: list of stripped, non-empty sentences
    """
    assert language in ["chinese", "english"], "unsupportable for other language"
    if language == "chinese":
        terminators = list("。！？…\t")
        closing_quote = "”"
    else:
        # "english"; this is also what runs for any other value when the
        # assert above is stripped (python -O).
        terminators = list(".!?")
        closing_quote = '"'

    pieces = []
    begin = 0
    last = len(sentence) - 1
    for pos, ch in enumerate(sentence):
        if pos == last:
            # Final character: flush whatever has not been emitted yet.
            pieces.append(sentence[begin:].strip())
            continue
        if ch not in terminators:
            continue
        following = sentence[pos + 1]
        if following == closing_quote:
            # Handles the 。”。 case: only cut after the quote when the
            # character behind the quote is not another terminator.
            if pos < last - 1 and sentence[pos + 2] not in terminators:
                pieces.append(sentence[begin:pos + 2].strip())
                begin = pos + 2
        elif following not in terminators:
            pieces.append(sentence[begin:pos + 1].strip())
            begin = pos + 1
    return [piece.strip() for piece in pieces if piece.strip()]

`trandition2simple(text)`

https://juejin.cn/post/7234554420163100728

Source code in src/nlpertools/dataprocess.py

    return text

def trandition2simple(self, text):
    """
    Convert Traditional Chinese text to Simplified Chinese (``zh-cn``).

    Only meaningful for Chinese text. Background reading:
    https://juejin.cn/post/7234554420163100728

    :param text: text to convert
    :return: whatever ``zhconv.convert(text, "zh-cn")`` returns
    """
    # Convert the caller's text. The previous version converted a hard-coded
    # sample sentence and ignored ``text`` entirely.
    return zhconv.convert(text, "zh-cn")

`uniform_whitespace(document, whitespace=[' ', '\u2009', '\u200a', '\u202f', '\u2005', '\u3000', '\u2002', '\xa0', '\u2008', '\u2003', '\ufffc', '\x84'])` `staticmethod`

There are different whitespace characters.

Source code in src/nlpertools/dataprocess.py

    return res

@staticmethod
def uniform_whitespace(
        document,
        whitespace=[
            " ",
            "\u2009",  # THIN SPACE
            "\u200a",  # HAIR SPACE
            "\u202f",  # NARROW NO-BREAK SPACE
            "\u2005",  # FOUR-PER-EM SPACE
            "\u3000",  # IDEOGRAPHIC SPACE
            "\u2002",  # EN SPACE
            "\xa0",  # NO-BREAK SPACE
            "\u2008",  # PUNCTUATION SPACE
            "\u2003",  # EM SPACE
            "\ufffc",  # OBJECT REPLACEMENT CHARACTER
            "\x84",  # C1 control character
        ],
):
    """
    Replace every character listed in ``whitespace`` with a plain space.

    There are many different whitespace characters; this maps them all to
    " " without changing the length of the document.
    Adapted from https://github.com/bigscience-workshop/data-preparation

    The defaults are spelled as escape sequences because the raw characters
    are indistinguishable (or invisible) in most editors.

    :param document: text to normalise
    :param whitespace: iterable of characters to treat as whitespace; the
        default list is only read, never mutated
    :return: the normalised text
    """
    targets = set(whitespace)
    return "".join(" " if char in targets else char for char in document)

`DataAnalysis`

Source code in src/nlpertools/ml.py

class DataAnalysis:
    @staticmethod
    def draw_pic(df, save_path):
        """
        Draw the "feature" distribution of the two label classes on one
        figure so that they can be compared, then save the figure.

        :param df: pd.DataFrame with "label" and "feature" columns
        :param save_path: str, handed to ``plt.savefig``
        :return: None
        """
        # Rows with label 1 are drawn first, then rows with label 0.
        for label_value, legend_name in ((1, "label1"), (0, "label2")):
            sns.distplot(df[df["label"] == label_value]["feature"], label=legend_name)
        plt.legend()
        plt.savefig(save_path)

`draw_pic(df, save_path)` `staticmethod`

画直方图，对比两个不同类别差异

Parameters:

Name	Type	Description	Default
`df`		pd.DataFrame	required
`save_path`		str	required

Returns:

Type	Description

Source code in src/nlpertools/ml.py

@staticmethod
def draw_pic(df, save_path):
    """
    Draw the "feature" distribution of the two label classes on one figure
    so that they can be compared, then save the figure.

    :param df: pd.DataFrame with "label" and "feature" columns
    :param save_path: str, handed to ``plt.savefig``
    :return: None
    """
    positives = df[df["label"] == 1]["feature"]
    negatives = df[df["label"] == 0]["feature"]
    sns.distplot(positives, label="label1")
    sns.distplot(negatives, label="label2")
    plt.legend()
    plt.savefig(save_path)

`STEM`

Bases: object

Source code in src/nlpertools/ml.py

class STEM(object):
    """Extraction helpers that run on top of an ``LTP`` pipeline instance."""

    def __init__(self, IPT_MODEL_PATH):
        """
        :param IPT_MODEL_PATH: model path handed to ``LTP``
        """
        self.ltp = LTP(IPT_MODEL_PATH)

    def start_by_dep(self, sentence):
        """
        Return the subject of the sentence using dependency arcs.

        Arcs are scanned in order. The word of the most recent ``SBV`` arc is
        remembered as the subject and returned as soon as an arc whose
        relation contains ``VOB`` is seen.

        :param sentence: raw sentence string
        :return: the subject word, or None when no VOB arc exists or when
            the first VOB arc comes before any SBV arc
        """
        seg, hidden = self.ltp.seg([sentence])
        dep = self.ltp.dep(hidden)  # , graph=False)
        seg, dep = seg[0], dep[0]
        # Start from None so that a VOB arc seen before any SBV arc yields
        # None instead of raising UnboundLocalError.
        subject = None
        for arc in dep:
            # subject-verb-object
            # NOTE(review): arc[0] is used directly as an index into seg;
            # confirm the index base of the installed LTP version.
            if 'SBV' == arc[2]:
                subject = seg[arc[0]]
            if 'VOB' in arc[2]:
                return subject

        return None

    def start_by_srl(self, sentence):
        """
        Extract events with the semantic role labelling tool.

        :param sentence: "他叫汤姆去拿外衣。"
        :return:  events: [['他', '叫', '汤姆', '去', '拿', '外衣'], ['汤姆', '拿', '外衣']]
        """
        # semantic role labelling
        seg, hidden = self.ltp.seg([sentence])
        srl = self.ltp.srl(hidden)
        seg, srl = seg[0], srl[0]
        events = []
        for wdx, each_srl in enumerate(srl):
            if each_srl:
                args = []
                for arg in each_srl:
                    # arg[1]..arg[2] is an inclusive word span
                    args.extend(seg[arg[1]:arg[2] + 1])
                # put the predicate right after the words of the first argument
                args.insert(each_srl[0][2] - each_srl[0][1] + 1, seg[wdx])
                events.append(args)
        return events

`start_by_srl(sentence)`

用语义角色标注工具

Parameters:

Name	Type	Description	Default
`sentence`		"他叫汤姆去拿外衣。"	required

Returns:

Type	Description
	events: [['他', '叫', '汤姆', '去', '拿', '外衣'], ['汤姆', '拿', '外衣']]

Source code in src/nlpertools/ml.py

def start_by_srl(self, sentence):
    """
    Extract events with the semantic role labelling tool.

    For every word that has role spans, the words of all spans are
    concatenated and the word itself (the predicate) is inserted right after
    the words of the first span.

    :param sentence: "他叫汤姆去拿外衣。"
    :return:  events: [['他', '叫', '汤姆', '去', '拿', '外衣'], ['汤姆', '拿', '外衣']]
    """
    words, hidden = self.ltp.seg([sentence])
    roles = self.ltp.srl(hidden)
    words, roles = words[0], roles[0]
    events = []
    for predicate_idx, spans in enumerate(roles):
        if not spans:
            continue
        # span[1]..span[2] is an inclusive word range
        event = [word for span in spans for word in words[span[1]:span[2] + 1]]
        first_span_len = spans[0][2] - spans[0][1] + 1
        event.insert(first_span_len, words[predicate_idx])
        events.append(event)
    return events

`convert_crf_format_10_fold(corpus, objdir_path)`

把已经是crf格式的数据，分成十折。 para:

Source code in src/nlpertools/ml.py

def convert_crf_format_10_fold(corpus, objdir_path):
    '''
    Split data that is already in CRF format into ten folds.

    For fold k (1..10) three files are written into ``objdir_path``:
    ``train{k}.txt`` with everything outside the fold, and ``test{k}.txt``
    and ``dev{k}.txt`` which both hold the fold itself. The last fold also
    takes the remainder when the corpus size is not a multiple of ten.

    :param corpus: sliceable sequence of samples
    :param objdir_path: output directory, created through ``j_mkdir``
    '''
    j_mkdir(objdir_path)
    fold_size = int(len(corpus) / 10)
    for fold in range(10):
        lo = fold * fold_size
        if fold < 9:
            hi = lo + fold_size
            dev_set = corpus[lo:hi]
            train_set = corpus[:lo] + corpus[hi:]
        else:
            # last fold: everything that is left
            dev_set = corpus[lo:]
            train_set = corpus[:lo]
        fold_no = fold + 1
        writetxt_w_list(train_set, os.path.join(objdir_path, 'train{}.txt'.format(fold_no)))
        writetxt_w_list(dev_set, os.path.join(objdir_path, 'test{}.txt'.format(fold_no)))
        writetxt_w_list(dev_set, os.path.join(objdir_path, 'dev{}.txt'.format(fold_no)))

`kfold(corpus, path, k=9, is_shuffle=True)`

k是10份中训练集占了几份

Source code in src/nlpertools/ml.py

def kfold(corpus, path, k=9, is_shuffle=True):
    '''
    Split ``corpus`` into a training part and a held-out part and write them
    as ``train.tsv``, ``test.tsv`` and ``dev.tsv`` under ``path`` (test and
    dev receive the same held-out rows).

    :param corpus: list of samples; shuffled IN PLACE when ``is_shuffle``
    :param path: output directory, created through ``j_mkdir``
    :param k: how many of the 10 equal parts go to the training set
    :param is_shuffle: shuffle the corpus before splitting
    '''
    j_mkdir(path)
    if is_shuffle:
        random.shuffle(corpus)
    boundary = k * int(len(corpus) / 10)
    train_set = corpus[:boundary]
    dev_set = corpus[boundary:]
    for file_name, rows in (('train.tsv', train_set), ('test.tsv', dev_set), ('dev.tsv', dev_set)):
        writetxt_w_list(rows, os.path.join(path, file_name), num_lf=1)
    # Alternative kept for reference (train / val / test split with sklearn):
    #
    # import pandas as pd
    # from sklearn.model_selection import KFold
    #
    # df = pd.DataFrame({
    #     "text": ["text_{}".format(i) for i in range(100)],
    #     "labels": ["label_{}".format(i % 10) for i in range(100)]
    # })
    # train_idx, test_and_val_idx = KFold(n_splits=8, shuffle=True).split(df).__next__()
    # df_test_and_val = df.iloc[test_and_val_idx]
    # test_idx, val_idx = KFold(n_splits=2, shuffle=True).split(df_test_and_val).__next__()
    # df_train = df.iloc[train_idx]
    # df_val = df.iloc[val_idx]
    # df_test = df.iloc[test_idx]
    # print(train_idx)
    # print(val_idx)
    # print(test_idx)

`label(text, labels)`

返回两列的标记数据序列

Parameters:

Name	Type	Description	Default
`text`			required
`labels`			required

Returns:

Type	Description

Source code in src/nlpertools/ml.py

def label(text, labels):
    '''
    Build a two-column (token<TAB>label) training sequence, one row per line.

    A space token is written as the placeholder ``[null]``.

    :param text: the text, split into single characters
    :param labels: one label per character of ``text``
    :return: the rows joined with newlines
    '''
    rows = []
    for char, tag in zip(list(text), labels):
        if char == ' ':
            rows.append('[null]\t{}'.format(tag))
        else:
            rows.append('\t'.join((char, tag)))
    return '\n'.join(rows)

`read_seq_res(path, labels)`

读序列标注三列数据的方法

Parameters:

Name	Type	Description	Default
`path`			required
`labels`			required

Returns:

Type	Description

Source code in src/nlpertools/ml.py

def read_seq_res(path, labels):
    '''
    Read a three-column sequence-labelling result file.

    Each non-blank line is ``token<TAB>raw label<TAB>predicted label`` and
    sequences are separated by blank lines. For every sequence the tokens are
    concatenated, and the row indexes at which each label of ``labels``
    occurs are collected separately for the raw and the predicted column.

    An empty file yields three empty lists. Lines with fewer than three
    columns still raise IndexError.

    :param path: path of the UTF-8 result file
    :param labels: labels whose positions should be collected
    :return: (text, raw_label, predict_label); ``text`` is a list of strings
        and the other two are lists of ``{label: [row index, ...]}`` dicts
    '''
    # Text mode gives universal newlines, so CRLF files split on blank lines
    # exactly like LF files (binary-style reading left the "\r" in place).
    with open(path, 'r', encoding='utf-8') as rd:
        seqs_str = rd.read().strip()
    text, raw_label, predict_label = [], [], []
    if not seqs_str:
        return text, raw_label, predict_label
    for seq in seqs_str.split('\n\n'):
        # Extra blank lines between sequences leave empty strings behind.
        rows = [line for line in seq.split('\n') if line]
        if not rows:
            continue
        raw_index_dict = {label: [] for label in labels}
        pre_index_dict = {label: [] for label in labels}
        tokens = []
        for idx, line in enumerate(rows):
            tmp = line.split('\t')
            tokens.append(tmp[0])
            if tmp[1] in raw_index_dict:
                raw_index_dict[tmp[1]].append(idx)
            if tmp[2] in pre_index_dict:
                pre_index_dict[tmp[2]].append(idx)
        text.append(''.join(tokens))
        raw_label.append(raw_index_dict)
        predict_label.append(pre_index_dict)
    return text, raw_label, predict_label

`seed_everything(seed=7777777)`

设置整个开发环境的seed

Parameters:

Name	Type	Description	Default
`seed`			`7777777`
`device`			required

Returns:

Type	Description
`None`

Source code in src/nlpertools/ml.py

def seed_everything(seed=7777777) -> None:
    """
    Seed every random number generator used by the environment: Python's
    ``random``, ``PYTHONHASHSEED``, NumPy and PyTorch (CPU and CUDA).

    :param seed: seed value applied everywhere
    :return: None
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,  # CPU generator
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for seeder in seeders:
        seeder(seed)
    # some cudnn methods can be random even after fixing the seed
    # unless you tell it to be deterministic
    torch.backends.cudnn.deterministic = True

`split_sentence(sentence, language='chinese', cross_line=True)`

分句，英文有nltk，中文怎么能没有好的分句工具呢

Parameters:

Name	Type	Description	Default
`sentence`			required
`language`			`'chinese'`
`cross_line`			`True`

Returns:

Type	Description

Source code in src/nlpertools/ml.py

def split_sentence(sentence, language='chinese', cross_line=True):
    """
    Split a string into sentences on terminal punctuation.

    Carriage returns are removed first. A terminator that is directly
    followed by the closing quote keeps the quote in the same sentence, and a
    run of consecutive terminators is kept together. Pieces are stripped but
    empty pieces are NOT removed.

    Example: '12“345。”“6789”' -> ['12“345。”', '“6789”']

    :param sentence: text to split
    :param language: "chinese" or "english"
    :param cross_line: for Chinese only, also treat "\\n" as a terminator
    :return: list of stripped sentences
    """
    assert language in ["chinese", "english"], "unsupportable for other language"
    sentence = sentence.replace("\r", "")
    if language == 'chinese':
        terminators = list('。！？…')
        if cross_line:
            terminators.append("\n")
        closing_quote = "”"
    else:
        # 'english'; this is also what runs for any other value when the
        # assert above is stripped (python -O).
        terminators = list('.!?')
        closing_quote = '"'

    pieces = []
    begin = 0
    last = len(sentence) - 1
    for pos, ch in enumerate(sentence):
        if pos == last:
            # Final character: flush whatever has not been emitted yet.
            pieces.append(sentence[begin:].strip())
            continue
        if ch not in terminators:
            continue
        following = sentence[pos + 1]
        if following == closing_quote:
            # Handles the 。”。 case: only cut after the quote when the
            # character behind the quote is not another terminator.
            if pos < last - 1 and sentence[pos + 2] not in terminators:
                pieces.append(sentence[begin:pos + 2].strip())
                begin = pos + 2
        elif following not in terminators:
            pieces.append(sentence[begin:pos + 1].strip())
            begin = pos + 1
    return pieces

`subject_object_labeling(spo_list, text)`

百度那种有spo字典的数据，给标成。草，看不懂，得找找哪里用的

Parameters:

Name	Type	Description	Default
`spo_list`			required
`text`			required

Returns:

Type	Description
	labeling_list

Source code in src/nlpertools/ml.py

def subject_object_labeling(spo_list, text):
    # TODO: find out where this is used; the original author marked the
    #       routine as hard to follow.
    '''
    Turn SPO triples (Baidu-style dicts with "predicate", "subject" and
    "object") into a BIO label sequence over ``text``.

    Triples are grouped by predicate. For the first predicate that occurs in
    ``text``, the first occurrence of each subject, object and the predicate
    itself is labelled B-/I-SUB, B-/I-OBJ and B-/I-PRE. A span that is empty
    or does not occur in ``text`` is skipped instead of raising.

    :param spo_list: list of dicts with "predicate", "subject", "object"
    :param text: the text the triples refer to
    :return: labeling_list (one label per element of ``text``), or None when
        no predicate occurs in ``text`` or nothing could be labelled
    '''

    def _group_by_predicate(items):
        """Map each predicate to its list of (subject, object) pairs."""
        grouped = dict()
        for item in items:
            grouped.setdefault(item["predicate"], []).append((item["subject"], item["object"]))
        return grouped

    def _find_span(needle, haystack):
        """Return the first index at which needle occurs in haystack, else None."""
        needle_length = len(needle)
        for start in range(len(haystack) - needle_length + 1):
            if all(q == k for q, k in zip(needle, haystack[start: start + needle_length])):
                return start
        return None

    def _label_span(span, span_type):
        """Write B-/I- labels of span_type over the first occurrence of span."""
        if not span:
            return
        start = _find_span(span, text)
        if start is None:
            # The span is not part of the text: there is nothing to label.
            return
        labeling_list[start] = 'B-' + span_type
        # Empty slice for one-element spans, so this covers every length.
        labeling_list[start + 1: start + len(span)] = ['I-' + span_type] * (len(span) - 1)

    labeling_list = ['O'] * len(text)
    for predicate, pairs in _group_by_predicate(spo_list).items():
        if predicate not in text:
            continue
        for spo_subject, spo_object in pairs:
            _label_span(spo_subject, 'SUB')
            _label_span(spo_object, 'OBJ')
            _label_span(predicate, 'PRE')
        if labeling_list != ['O'] * len(text):
            return labeling_list
    return None

`GaussDecay`

Bases: object

当前只实现了时间的，全部使用默认值

Source code in src/nlpertools/other.py

class GaussDecay(object):
    """
    Gaussian decay scoring.

    Only the time task is implemented so far, and it always uses the built-in
    default scale and offset.
    """

    def __init__(self, origin='2022-08-02', scale='90d', offset='5d', decay=0.5, task="time"):
        """
        :param origin: stored as ``self.origin``
        :param scale: passed to ``translate`` (which currently ignores it)
        :param offset: passed to ``translate`` (which currently ignores it)
        :param decay: score reached at a distance of ``offset + scale``
        :param task: kind of field; only "time" is implemented
        """
        self.origin = origin
        self.task = task
        self.scale, self.offset = self.translate(scale, offset)
        self.decay = decay
        # weights of the decay score and of the raw relevance score
        self.time_coefficient = 0.6
        self.related_coefficient = 0.4

    def translate(self, scale, offset):
        """
        Translate the field-specific inputs into standard numeric values.

        NOTE(review): both branches return the fixed values 180 and 5, so the
        ``scale`` / ``offset`` arguments are ignored; confirm this is wanted.

        :return: (scale, offset)
        """
        if self.task == "time":
            scale, offset = 180, 5
        else:
            scale, offset = 180, 5
        return scale, offset

    @staticmethod
    def translated_minus(field_value):
        """
        Return the number of whole days between now and ``field_value``.

        :param field_value: timestamp string in '%Y-%m-%d %H:%M:%S' format
        :return: ``(now - field_value).days``
        """
        origin = datetime.datetime.now()
        field_value = datetime.datetime.strptime(field_value, '%Y-%m-%d %H:%M:%S')
        return (origin - field_value).days

    def calc_exp(self):
        """Exponential decay; not implemented."""
        pass

    def calc_liner(self):
        """Linear decay; not implemented."""
        pass

    def calc_gauss(self, raw_score, field_value):
        r"""
        Blend a Gaussian time-decay score with the raw relevance score.

        $$S(doc)=exp(-\frac{max(0,|fieldvalues_{doc}-origin|-offset)^2}{2σ^2})$$ -
        $$σ^2=-scale^2/(2·ln(decay))$$

        :param raw_score: relevance score, weighted by ``related_coefficient``
        :param field_value: timestamp string accepted by ``translated_minus``
        :return: the weighted sum, rounded to 7 decimals
        """
        # The docstring is a raw string so that "\frac" is not read as a
        # form-feed escape.
        distance = abs(self.translated_minus(field_value))
        numerator = max(0, distance - self.offset) ** 2
        sigma_square = -1 * self.scale ** 2 / (2 * math.log(self.decay))
        denominator = 2 * sigma_square
        s = math.exp(-1 * numerator / denominator)
        return round(self.time_coefficient * s + self.related_coefficient * raw_score, 7)

`calc_gauss(raw_score, field_value)`

$$S(doc)=\exp\left(-\frac{\max(0,|fieldvalues_{doc}-origin|-offset)^2}{2\sigma^2}\right)$$ - $$\sigma^2=-\frac{scale^2}{2\cdot\ln(decay)}$$

Parameters:

Name	Type	Description	Default
`raw_score`			required
`field_value`			required

Returns:

Type	Description

Source code in src/nlpertools/other.py

def calc_gauss(self, raw_score, field_value):
    r"""
    Blend a Gaussian time-decay score with the raw relevance score.

    $$S(doc)=exp(-\frac{max(0,|fieldvalues_{doc}-origin|-offset)^2}{2σ^2})$$ -
    $$σ^2=-scale^2/(2·ln(decay))$$

    :param raw_score: relevance score, weighted by ``related_coefficient``
    :param field_value: value handed to ``self.translated_minus``
    :return: the weighted sum, rounded to 7 decimals
    """
    # The docstring is a raw string so that "\frac" is not read as a
    # form-feed escape.
    distance = abs(self.translated_minus(field_value))
    numerator = max(0, distance - self.offset) ** 2
    sigma_square = -1 * self.scale ** 2 / (2 * math.log(self.decay))
    denominator = 2 * sigma_square
    s = math.exp(-1 * numerator / denominator)
    return round(self.time_coefficient * s + self.related_coefficient * raw_score, 7)

`translate(scale, offset)`

将领域的输入转化为标准

Returns:

Type	Description

Source code in src/nlpertools/other.py

def translate(self, scale, offset):
    """
    Translate the field-specific inputs into standard numeric values.

    NOTE(review): every task currently maps to the same fixed values, so the
    ``scale`` / ``offset`` arguments are ignored; confirm this is wanted.

    :return: (scale, offset)
    """
    if self.task == "time":
        return 180, 5
    # every other task falls back to the same values for now
    return 180, 5

`auto_close()`

针对企业微信15分钟会显示离开的机制，假装自己还在上班

Source code in src/nlpertools/other.py

def auto_close():
    """
    Keep the session looking active: WeCom shows the user as away after 15
    minutes, so the mouse is moved and clicked every 14 minutes.

    Before the loop starts, a one-off Windows task named "shut" is registered
    through ``schtasks`` to force a shutdown at 23:30. The loop never
    returns.
    """
    import os
    import time
    import pyautogui as pg

    os.system('schtasks /create /tn shut /tr "shutdown -s -f" /sc once /st 23:30')
    while True:
        # glide to the fixed screen position over 2 seconds, then click
        pg.moveTo(970, 17, 2)
        pg.click()
        time.sleep(840)

`camel_to_snake(s)`

将 camel case 转换到 snake case.

Parameters:

Name	Type	Description	Default
`s`	`str`	camel case variable	required

Returns:

Type	Description
`str`

Source code in src/nlpertools/other.py

def camel_to_snake(s: str) -> str:
    """
    Convert camel case to snake case.

    An underscore is inserted before every upper-case character except the
    first character of the string, then the result is lower-cased. An empty
    string is returned unchanged.

    :param s: camel case variable
    :return: the snake case name
    """
    if not s:
        return s
    tail = "".join("_" + char if char.isupper() else char for char in s[1:])
    return (s[0] + tail).lower()

`git_push()`

针对国内提交github经常失败，自动提交

Source code in src/nlpertools/other.py

def git_push(max_retries=None):
    """
    Push to GitHub and retry automatically, because pushes from mainland
    China fail frequently.

    ``os.system`` returns the command's exit status (an int), not its output,
    so success is detected by a zero status.

    :param max_retries: maximum number of retries after the first attempt;
        None (the default) retries until the push succeeds
    :return: None
    """
    num = 0
    while True:
        print("retry num: {}".format(num))
        res = os.system("git push --set-upstream origin main")
        print(str(res))
        if res == 0:
            print("scucess")
            break
        num += 1
        if max_retries is not None and num > max_retries:
            break

`snake_to_camel(s)`

author: u 将 snake case 转换到 camel case.

Parameters:

Name	Type	Description	Default
`s`	`str`	snake case variable	required

Returns:

Type	Description
`str`

Source code in src/nlpertools/other.py

def snake_to_camel(s: str) -> str:
    """
    author: u
    Convert snake case to camel case.

    The string is title-cased first and the underscores are removed
    afterwards, so the first letter is upper-cased as well.

    :param s: snake case variable
    :return: the camel case name
    """
    titled = s.title()
    return "".join(titled.split("_"))

`spider(url)`

Parameters:

Name	Type	Description	Default
`url`			required

Returns:

Type	Description

Source code in src/nlpertools/other.py

def spider(url):
    """
    Fetch a Baijiahao article and return its title and body text.

    :param url: page url; only urls containing 'baijiahao' are handled
    :return: "<title>\\n<body>", or None for any other url
    """
    if 'baijiahao' not in url:
        return None
    content = requests.get(url)
    html = pq.PyQuery(content.text)
    title = html('.index-module_articleTitle_28fPT').text()
    res = html('.index-module_articleWrap_2Zphx').text()
    # Drop the trailing footer as a whole suffix. str.rstrip() would treat
    # the argument as a set of characters and could also eat the end of the
    # article itself.
    footer = '举报/反馈'
    if res.endswith(footer):
        res = res[:-len(footer)]
    return '{}\n{}'.format(title, res)

`EmailClient`

Bases: object

Source code in src/nlpertools/plugin.py

class EmailClient(object):
    """Minimal SMTP mail sender that talks to smtp.qq.com."""

    def __init__(self):
        # Credentials and recipient start out empty; set them before sending.
        self.mail_user = ""
        self.mail_pass = ""
        self.receiver = ""

    def sent_email(self, title, content):
        """
        Send a plain-text mail from ``self.mail_user`` to ``self.receiver``.

        Usage: set ``mail_user``, ``mail_pass`` and ``receiver`` on the
        instance, then call ``sent_email(title, content)``.

        SMTP errors are printed, not raised.

        :param title: mail subject
        :param content: plain-text body (sent as utf-8)
        :return: None
        """

        # login info
        mail_host = 'smtp.qq.com'
        mail_user = self.mail_user
        mail_pass = self.mail_pass
        sender = mail_user

        # email info
        message = MIMEText(content, 'plain', 'utf-8')
        message['Subject'] = title
        message['From'] = sender
        message['To'] = self.receiver

        # login and send
        try:
            # The context manager sends QUIT and closes the socket even when
            # login or sendmail raises, so the connection cannot leak.
            with smtplib.SMTP() as smtpObj:
                smtpObj.connect(mail_host, 25)
                smtpObj.login(mail_user, mail_pass)
                smtpObj.sendmail(sender, self.receiver, message.as_string())
            print('send email succes')
        except smtplib.SMTPException as e:
            print('erro', e)

`sent_email(title, content)`

mail_user = 'xxx'

mail_pass = 'xxx'

receiver = 'xxx'

sent_email(mail_user, mail_pass, receiver)

Source code in src/nlpertools/plugin.py

def sent_email(self, title, content):
    """
    Send a plain-text mail from ``self.mail_user`` to ``self.receiver``
    through smtp.qq.com.

    Usage: set ``mail_user``, ``mail_pass`` and ``receiver`` on the instance,
    then call ``sent_email(title, content)``.

    SMTP errors are printed, not raised.

    :param title: mail subject
    :param content: plain-text body (sent as utf-8)
    :return: None
    """

    # login info
    mail_host = 'smtp.qq.com'
    mail_user = self.mail_user
    mail_pass = self.mail_pass
    sender = mail_user

    # email info
    message = MIMEText(content, 'plain', 'utf-8')
    message['Subject'] = title
    message['From'] = sender
    message['To'] = self.receiver

    # login and send
    try:
        # The context manager sends QUIT and closes the socket even when
        # login or sendmail raises, so the connection cannot leak.
        with smtplib.SMTP() as smtpObj:
            smtpObj.connect(mail_host, 25)
            smtpObj.login(mail_user, mail_pass)
            smtpObj.sendmail(sender, self.receiver, message.as_string())
        print('send email succes')
    except smtplib.SMTPException as e:
        print('erro', e)

`convert_import_string_to_import_list(text)`

该方法将 import 转变为 try import

Source code in src/nlpertools/utils_for_nlpertools.py

def convert_import_string_to_import_list(text):
    """
    Rewrite plain import statements as ``try_import`` assignments.

    Every ``import x [as y]`` / ``from m import x [as y]`` line becomes
    ``<alias> = try_import("<module>", <name or None>)``. The generated lines
    are printed and returned. Blank lines and lines without an import target
    are skipped.

    NOTE(review): in a comma-separated import only the bare names are carried
    over; ``as`` aliases after the first comma are not handled.

    :param text: newline-separated import statements
    :return: list of generated source lines, starting with the
        ``try_import`` import header
    """
    models_to_import = []
    # Work queue: a multi-name import appends one synthetic single-name line
    # per extra name to the end of the queue.
    pending = text.split("\n")
    position = 0
    while position < len(pending):
        each = pending[position]
        position += 1
        # split() without arguments tolerates indentation and repeated spaces
        elements = each.split()
        if not elements:
            continue
        print(each)
        name, package, as_name = None, None, None
        for pre, cur in zip(elements, elements[1:]):
            if cur.endswith(","):
                cur = cur.rstrip(",")
            # To treat "from ... import" and "import" alike, package and name
            # are filled the other way round here and swapped afterwards.
            if pre == "import":
                package = cur
            if pre == "from":
                name = cur
            if pre == "as":
                as_name = cur
            if pre.endswith(","):
                # e.g. from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
                # re-queue the extra name behind the same "from ... import" prefix
                prefix = each.split("import")[0]
                pending.append("{}import {}".format(prefix, cur))
        if package is None:
            # Not an import statement: there is nothing to convert.
            continue
        if not as_name:
            as_name = package.split(".")[-1]
        if not name:
            name, package = package, name
        models_to_import.append((name, package, as_name))
    # print the result
    all_import_info = ["", "from utils_for_nlpertools import try_import", ""]
    for name, package, as_name in models_to_import:
        import_info = '{} = try_import("{}", {})'.format(as_name, name, '"{}"'.format(package) if package else package)
        all_import_info.append(import_info)
        print(import_info)
    return all_import_info

`fn_async_timer(function)`

针对异步函数的装饰器

Source code in src/nlpertools/wrapper.py

def fn_async_timer(function):
    """
    Decorator for async functions: awaits the wrapped coroutine function,
    prints how long the call took and passes its result through.
    """
    @wraps(function)
    async def timed_call(*args, **kwargs):
        started = time.time()
        outcome = await function(*args, **kwargs)
        elapsed = time.time() - started
        print('[finished {func_name} in {time:.2f}s]'.format(func_name=function.__name__, time=elapsed))
        return outcome

    return timed_call

`fn_timeout_checker(wait_time, callback)`

超时判断的装饰器。有 gevent 和 eventlet 两个包可选；使用 gevent 出现 bug，因此采用 eventlet。

Source code in src/nlpertools/wrapper.py

def fn_timeout_checker(wait_time, callback):
    """
    Decorator factory that enforces a timeout on the wrapped function.

    When the wrapped call does not finish within ``wait_time``, its result is
    replaced by the return value of ``callback()``.

    Two packages were tried for this; gevent produced bugs, so eventlet is
    used. ``monkey_patch(time=True)`` runs when the factory is called.

    :param wait_time: timeout handed to ``eventlet.Timeout``
    :param callback: zero-argument callable used on timeout
    :return: the decorator
    """
    # from gevent import Timeout
    # from gevent.monkey import patch_all

    # patch_all()  # With thread=False, together with a Flask app running
    # threaded=True, this raised errors; the interplay of blocking and
    # threads is not understood yet. The default thread=True worked.

    from eventlet import Timeout
    from eventlet import monkey_patch

    monkey_patch(time=True)

    def wrapper(func):
        @wraps(func)  # keep the wrapped function's name and docstring
        def inner(*args, **kwargs):
            finish_flag = False
            # Timeout(..., False) swallows the timeout silently, so the flag
            # is the only way to tell whether func actually completed.
            with Timeout(wait_time, False):
                res = func(*args, **kwargs)
                finish_flag = True
            if not finish_flag:
                res = callback()
            return res

        return inner

    return wrapper

`fn_timer(async_func=False, analyse=False)`

@fn_timer() def example(): time.sleep(2)

Parameters:

Name	Type	Description	Default
`analyse`			`False`

Returns:

Type	Description

Source code in src/nlpertools/wrapper.py

def fn_timer(async_func=False, analyse=False):
    """
    Decorator factory that reports how long the wrapped function takes.

    Example::

        @fn_timer()
        def example():
            time.sleep(2)

    :param async_func: pass True (exactly) to wrap a coroutine function; the
        call is then scheduled with ``asyncio.create_task`` and awaited
    :param analyse: for sync functions, profile the call with pyinstrument
        and print the profile instead of the elapsed time
    :return: the decorator
    """

    def wrapper(func):
        if async_func is True:
            @wraps(func)
            async def func_time_async(*args, **kwargs):
                t0 = time.time()
                result = await asyncio.create_task(func(*args, **kwargs))
                t1 = time.time()
                print('[finished {func_name} in {time:.2f}s]'.format(func_name=func.__name__, time=t1 - t0))
                return result

            return func_time_async

        if analyse:
            @wraps(func)
            def func_time_analyse(*args, **kwargs):
                # imported lazily so pyinstrument is only needed when used
                from pyinstrument import Profiler

                profiler = Profiler()
                profiler.start()

                result = func(*args, **kwargs)

                profiler.stop()
                profiler.print()
                return result

            return func_time_analyse

        @wraps(func)
        def func_time(*args, **kwargs):
            t0 = time.time()
            result = func(*args, **kwargs)
            t1 = time.time()
            print('[finished {func_name} in {time:.2f}s]'.format(func_name=func.__name__, time=t1 - t0))
            return result

        return func_time

    return wrapper

`fn_try(parameter)`

该函数把try...catch...封装成装饰器，接收一个字典参数，并把其中的msg字段改为具体报错信息

Parameters:

Name	Type	Description	Default
`parameter`		{"msg": "", etc.}	required

Returns:

Type	Description
	parameter: {"msg": 内容填充为具体的报错信息, etc.}

Source code in src/nlpertools/wrapper.py

def fn_try(parameter):
    """
    Wrap try/except into a decorator.

    When the wrapped function raises, a copy of ``parameter`` is returned
    whose "msg" entry is the "msg" template formatted with the error text.
    ``parameter`` itself is left untouched, so the template stays usable for
    later failures.

    :param parameter: {"msg": "<template with one {} placeholder>", etc.}
    :return: the decorator; the decorated function returns its normal result,
        or on failure {"msg": <filled with the error text>, etc.}
    """

    def wrapper(function):
        @wraps(function)
        def inner(*args, **kwargs):
            try:
                return function(*args, **kwargs)
            except Exception as e:
                msg = "报错！"
                print('[func_name: {func_name} {msg}]'.format(func_name=function.__name__, msg=msg))
                # Fill a copy: formatting in place would consume the
                # template, and later errors would return a stale message.
                failure = dict(parameter)
                failure["msg"] = parameter["msg"].format(str(e))
                return failure

        return inner

    return wrapper

API Reference

HBaseOps

MongoOps

fetch_all()

load_from_mongo(special_value)

save_to_mongo(special_value, each_item)

update_to_mongo(condition_term, condition_value, new_value)

CopyFunc

is_chinese_char(cp)

Pattern

TextProcess

__init__(patterns_filter=None, patterns_replace=None, words_filter=[])

filter_exclusive(text)

filter_html(text)

filter_html_special(text)

filter_pattern(text)

full2half(text)

split_sentence(sentence, language='chinese') staticmethod

trandition2simple(text)

uniform_whitespace(document, whitespace=[' ', '\u2009', '\u200a', '\u202f', '\u2005', '\u3000', '\u2002', '\xa0', '\u2008', '\u2003', '￼', '\x84']) staticmethod

DataAnalysis

draw_pic(df, save_path) staticmethod

STEM

start_by_srl(sentence)

convert_crf_format_10_fold(corpus, objdir_path)

kfold(corpus, path, k=9, is_shuffle=True)

label(text, labels)

read_seq_res(path, labels)

seed_everything(seed=7777777)

split_sentence(sentence, language='chinese', cross_line=True)

subject_object_labeling(spo_list, text)

GaussDecay

calc_gauss(raw_score, field_value)

translate(scale, offset)

auto_close()

camel_to_snake(s)

git_push()

snake_to_camel(s)

spider(url)

EmailClient

sent_email(title, content)

mail_user = 'xxx'

mail_pass = 'xxx'

receiver = 'xxx'

sent_email(mail_user, mail_pass, receiver)

convert_import_string_to_import_list(text)

fn_async_timer(function)

fn_timeout_checker(wait_time, callback)

fn_timer(async_func=False, analyse=False)

fn_try(parameter)

`HBaseOps`

`MongoOps`

`fetch_all()`

`load_from_mongo(special_value)`

`save_to_mongo(special_value, each_item)`

`update_to_mongo(condition_term, condition_value, new_value)`

`CopyFunc`

`is_chinese_char(cp)`

`Pattern`

`TextProcess`

`__init__(patterns_filter=None, patterns_replace=None, words_filter=[])`

`filter_exclusive(text)`

`filter_html(text)`

`filter_html_special(text)`

`filter_pattern(text)`

`full2half(text)`

`split_sentence(sentence, language='chinese')` `staticmethod`

`trandition2simple(text)`

`uniform_whitespace(document, whitespace=[' ', '\u2009', '\u200a', '\u202f', '\u2005', '\u3000', '\u2002', '\xa0', '\u2008', '\u2003', '\ufffc', '\x84'])` `staticmethod`

`DataAnalysis`

`draw_pic(df, save_path)` `staticmethod`

`STEM`

`start_by_srl(sentence)`

`convert_crf_format_10_fold(corpus, objdir_path)`

`kfold(corpus, path, k=9, is_shuffle=True)`

`label(text, labels)`

`read_seq_res(path, labels)`

`seed_everything(seed=7777777)`

`split_sentence(sentence, language='chinese', cross_line=True)`

`subject_object_labeling(spo_list, text)`

`GaussDecay`

`calc_gauss(raw_score, field_value)`

`translate(scale, offset)`

`auto_close()`

`camel_to_snake(s)`

`git_push()`

`snake_to_camel(s)`

`spider(url)`

`EmailClient`

`sent_email(title, content)`

`convert_import_string_to_import_list(text)`

`fn_async_timer(function)`

`fn_timeout_checker(wait_time, callback)`

`fn_timer(async_func=False, analyse=False)`

`fn_try(parameter)`