API Reference

HBaseOps

Bases: object

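Demo: `key = 'test'`; `db = HBaseHelper(host=hbase_host)`; `data = db.query_single_line(table='table', row_key=key)`; `print(data)`. (Note: the class documented here is `HBaseOps`, and its constructor takes a `config` mapping rather than a `host` argument.)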

Source code in src/nlpertools/data_client.py
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
class HBaseOps(object):
    """
    Convenience wrapper around a ``happybase`` connection.

    demo
    key = 'test'
    db = HBaseOps()
    data = db.query_single_line(table='table', row_key=key)
    print(data)
    """

    def __init__(self, config=None):
        """
        Read the connection settings and open a connection.

        :param config: mapping with the keys ``DEFAULT_HOST``, ``DEFAULT_PORT``,
            ``DEFAULT_COMPAT``, ``DEFAULT_TRANSPORT`` and ``DEFAULT_PROTOCOL``;
            ``global_db_config["hbase"]`` is used when omitted
        """
        if config is None:
            # Resolved at call time so that defining the class does not
            # require the global configuration to exist yet.
            config = global_db_config["hbase"]
        self.host = config["DEFAULT_HOST"]
        self.port = config["DEFAULT_PORT"]
        self.compat = config["DEFAULT_COMPAT"]
        self.table_prefix = None  # namespace
        self.transport = config["DEFAULT_TRANSPORT"]
        self.protocol = config["DEFAULT_PROTOCOL"]
        self.conn = self.connect()

    @staticmethod
    def _to_text(value):
        """Decode ``bytes`` as UTF-8; convert any other value with ``str``."""
        if isinstance(value, bytes):
            return value.decode('utf-8')
        return str(value)

    @classmethod
    def _decode_cells(cls, cells):
        """Return one row's ``{column: value}`` mapping with keys and values as text."""
        return {cls._to_text(col): cls._to_text(val) for col, val in cells.items()}

    @classmethod
    def _decode_rows(cls, rows):
        """Return ``{row_key: {column: value}}`` with every key and value as text."""
        return {cls._to_text(row_key): cls._decode_cells(cells) for row_key, cells in rows.items()}

    def connect(self):
        """
        Open a new ``happybase.Connection`` from the stored settings.

        :return: the new connection (``autoconnect=True``, ``timeout=None``)
        """
        conn = happybase.Connection(host=self.host, port=self.port, timeout=None, autoconnect=True,
                                    table_prefix=self.table_prefix, compat=self.compat,
                                    transport=self.transport, protocol=self.protocol)
        return conn

    def create_hb_table(self, table_name, **families):
        """
        Create a table.

        :param table_name: name of the table
        :param families: column-family options, passed on as a dict
        """
        self.conn.create_table(table_name, families)

    def single_put(self, table_name, row_key, column, data):
        """
        Write one row.

        :param table_name: name of the table
        :param row_key: key of the row
        :param column: prefix placed before ``:`` in every column name
        :param data: mapping of qualifier -> value; values are stored as
            ``str(value)`` encoded with UTF-8
        """
        hb = happybase.Table(table_name, self.conn)
        hb.put(row_key,
               data={'{column}:{k}'.format(column=column, k=k): str(v).encode("utf-8") for k, v in data.items()})

    def batch_put(self, table, row_key_name, column, datas, batch_size=1):
        """
        Write many rows in chunks of ``batch_size``.

        :param table: name of the table
        :param row_key_name: key inside each item whose value is used as the row key
        :param column: prefix placed before ``:`` in every column name
        :param datas: sequence of mappings, each one containing ``row_key_name``
        :param batch_size: number of items per batch
        :return: the last batch object used, or ``None`` when ``datas`` is empty
        """
        hb = happybase.Table(table, self.conn)
        datas_new = [datas[i:i + batch_size] for i in range(0, len(datas), batch_size)]
        # Pre-bind the name so an empty ``datas`` returns None instead of
        # failing on an unbound local.
        batch = None
        for x in datas_new:
            with hb.batch(batch_size=batch_size) as batch:
                for da in x:
                    da_nw = {'{column}:{k}'.format(column=column, k=k): v for k, v in da.items()}
                    # The row-key entry is removed so it is not stored as a column.
                    row_key = da_nw.pop('{column}:{k}'.format(column=column, k=row_key_name))
                    batch.put(row_key, da_nw)
        return batch

    def single_put_self(self, table_name, row_keys, datas):
        """
        Write the first two elements of each value in ``datas`` under two fixed columns.

        :param table_name: name of the table
        :param row_keys: row keys, paired positionally with the items of ``datas``
        :param datas: mapping whose values are indexable with ``[0]`` and ``[1]``
        """
        hb = happybase.Table(table_name, self.conn)
        for row_key, (_, val) in zip(row_keys, datas.items()):
            hb.put(row_key, {'maybe_table_name:maybe_column_name': "%s" % val[0],
                             'maybe_table_name:maybe_column_name2': "%s" % val[1]})

    def scan_table(self, table, row_start=None, row_stop=None, include_timestamp=False, limit=None, timestamps=None,
                   filter=None):
        """
        Scan a table and return the rows as text.

        :param table: name of the table
        :param row_start: forwarded to ``Table.scan`` as ``row_start``
        :param row_stop: forwarded to ``Table.scan`` as ``row_stop``
        :param include_timestamp: accepted but not forwarded to the scan
        :param limit: forwarded to ``Table.scan`` as ``limit``
        :param timestamps: forwarded to ``Table.scan`` as ``timestamp``
        :param filter: forwarded to ``Table.scan`` as ``filter``
        :return: ``{row_key: {column: value}}`` as ``str``; ``{}`` when nothing matched
        """
        hb = happybase.Table(table, self.conn)
        scan = hb.scan(row_start=row_start, row_stop=row_stop, limit=limit, timestamp=timestamps, filter=filter)
        # bytes are decoded directly; ``str(b"..")`` has no ``decode`` on Python 3.
        return self._decode_rows(dict(scan))

    def query_single_line(self, table, row_key):
        """
        Fetch one row over a freshly opened connection.

        :param table: name of the table
        :param row_key: key of the row
        :return: ``{column: value}`` as ``str``; ``{}`` when the row is empty
        """
        conn = self.connect()
        try:
            hb_dict = happybase.Table(table, conn).row(row_key)
        finally:
            # The per-call connection is closed so it does not leak.
            conn.close()
        return self._decode_cells(hb_dict) if hb_dict else {}

    def query_multi_lines(self, table, row_keys):
        """
        Fetch several rows.

        :param table: name of the table
        :param row_keys: keys of the rows
        :return: ``{row_key: {column: value}}`` as ``str``; ``{}`` when nothing was found
        """
        hb = happybase.Table(table, self.conn)
        return self._decode_rows(dict(hb.rows(row_keys)))

    def single_delete(self, table, row_key):
        """
        Delete one row.

        :param table: name of the table
        :param row_key: key of the row
        """
        hb = happybase.Table(table, self.conn)
        hb.delete(row_key)

    def test_scan(self, table):
        """
        Scan up to 1000 rows with a sample ``SingleColumnValueFilter``.

        :param table: name of the table
        :return: ``{row_key: {column: value}}`` as ``str``; ``{}`` when nothing matched
        """
        hb = happybase.Table(table, self.conn)
        scan_filter = "SingleColumnValueFilter ('maybe_column_name', 'lang', =, 'regexstring:[regex_string]')"
        scan = hb.scan(limit=1000, filter=scan_filter)
        return self._decode_rows(dict(scan))

    def close(self):
        """Close the connection opened in ``__init__``."""
        self.conn.close()

MongoOps

Bases: object

Source code in src/nlpertools/data_client.py
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
class MongoOps(object):
    """
    Helper for a single MongoDB collection selected by a config mapping.
    """

    def __init__(self, config=None):
        """
        :param config: mapping with the keys ``uri``, ``db`` and ``col``;
            ``global_db_config["mongo"]`` is used when omitted
        """
        if config is None:
            # Resolved at call time so that defining the class does not
            # require the global configuration to exist yet.
            config = global_db_config["mongo"]
        mongo_client = MongoClient(config["uri"])
        db = mongo_client[config["db"]]
        self.collection = db[config["col"]]

    @staticmethod
    def _stringify_ids(cursor):
        """Return the documents of ``cursor`` as a list, with ``_id`` cast to ``str``."""
        documents = []
        for record in cursor:
            record['_id'] = str(record['_id'])
            documents.append(record)
        return documents

    def fetch_all(self):
        """
        Read every document of the collection.

        :return: list of documents with ``_id`` converted to ``str``
        """
        print('提取所有数据.')
        return self._stringify_ids(self.collection.find({}))

    def load_from_mongo(self, special_value):
        """
        Load the matching document with the shortest ``another_value``.

        :param special_value: used both as the field name and as the value to match
        :return: the first match with the smallest ``len(another_value)``
            (a missing field counts as empty), or ``None`` when nothing matches
        """
        # NOTE(review): the field name is derived from the value itself, while
        # save_to_mongo stores documents under "special_value" - confirm intent.
        records = list(self.collection.find({"{}".format(special_value): special_value}))
        if not records:
            return None
        return min(records, key=lambda x: len(x.get("another_value", [])))

    def delete_all(self):
        """
        Delete every document of the collection.

        :return: the result object of ``delete_many``
        """
        return self.collection.delete_many({})

    def delete_by_time(self, time):
        """
        Delete documents whose ``name`` starts with ``F``.

        :param time: currently unused
        :return: the result object of ``delete_many``
        """
        # NOTE(review): ``time`` does not take part in the query - confirm the
        # intended filter before relying on this method.
        query = {"name": {"$regex": "^F"}}
        return self.collection.delete_many(query)

    def fetch_by_time(self, year=2022, month=7, day=7, hour=7, minute=7, second=7):
        """
        Read documents whose ``query_time`` is at or after the given moment.

        :return: list of documents, newest ``query_time`` first, with ``_id``
            converted to ``str``
        """
        query = {"query_time": {"$gte": datetime.datetime(year, month, day, hour, minute, second)}}
        sort_sql = [("query_time", -1)]
        print('提取所有数据.')
        return self._stringify_ids(self.collection.find(query).sort(sort_sql))

    def save_to_mongo(self, special_value, each_item):
        """
        Append ``each_item`` to the document stored for ``special_value``.

        A new document ``{"special_value": ..., "each_item": [...]}`` is
        inserted when none exists yet.

        :param special_value: value identifying the document
        :param each_item: item appended to the ``each_item`` list
        :return: None
        """
        # Look the document up under the same field it is inserted with;
        # otherwise the update branch can never be reached.
        selector = {"special_value": special_value}
        if self.collection.find_one(selector) is not None:
            self.collection.update_one(selector, {"$push": {'each_item': each_item}})
        else:
            insert_item = {
                "special_value": special_value,
                "each_item": [each_item]
            }
            self.collection.insert_one(insert_item)
        print("update success")

    def insert_one(self, data):
        """
        Insert ``data`` as a new document.

        :param data: document to insert
        """
        self.collection.insert_one(data)

    def update_to_mongo(self, condition_term, condition_value, new_value):
        """
        Update the document matching ``condition_term == condition_value``,
        or insert one when there is no match.

        :param condition_term: field name to match on
        :param condition_value: value the field must have
        :param new_value: passed as the ``$push`` specification on update, and
            stored under ``processed_data`` on insert; preferably a dict
        :return: None
        """
        selector = {condition_term: condition_value}
        if self.collection.find_one(selector) is not None:
            self.collection.update_one(selector, {"$push": new_value})
        else:
            insert_item = {
                condition_term: condition_value,
                "processed_data": new_value
            }
            self.collection.insert_one(insert_item)
        print("update success")

fetch_all()

读取所有数据

Returns:

Type Description
Source code in src/nlpertools/data_client.py
80
81
82
83
84
85
86
87
88
89
90
def fetch_all(self):
    """
    Read every document of the collection.

    :return: list of documents, each with its ``_id`` converted to ``str``
    """
    print('提取所有数据.')
    documents = []
    for doc in self.collection.find({}):
        # ids are stringified in place
        doc.update(_id=str(doc['_id']))
        documents.append(doc)
    return documents

load_from_mongo(special_value)

读取mongodb该special_value下所有值为special_value的数据

Returns:

Type Description
Source code in src/nlpertools/data_client.py
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
def load_from_mongo(self, special_value):
    """
    Load the matching document with the shortest ``another_value``.

    The query matches documents whose field named ``special_value`` has the
    value ``special_value``.

    :param special_value: field name and value to match
    :return: the first match with the smallest ``len(another_value)`` (a
        missing field counts as empty), or ``None`` when nothing matches
    """
    matches = list(self.collection.find({"{}".format(special_value): special_value}))
    if matches:
        # min() keeps the first of several equally short candidates
        return min(matches, key=lambda doc: len(doc.get("another_value", [])))
    return None

save_to_mongo(special_value, each_item)

数据存入mongo

Parameters:

Name Type Description Default
special_value required
each_item required

Returns:

Type Description
Source code in src/nlpertools/data_client.py
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
def save_to_mongo(self, special_value, each_item):
    """
    Append ``each_item`` to the document stored for ``special_value``.

    A new document ``{"special_value": ..., "each_item": [...]}`` is inserted
    when none exists yet.

    :param special_value: value identifying the document
    :param each_item: item appended to the ``each_item`` list
    :return: None
    """
    # Look the document up under the same field it is inserted with;
    # otherwise the update branch can never be reached and every call
    # inserts a duplicate document.
    selector = {"special_value": special_value}
    if self.collection.find_one(selector) is not None:
        self.collection.update_one(selector, {"$push": {'each_item': each_item}})
    else:
        insert_item = {
            "special_value": special_value,
            "each_item": [each_item]
        }
        self.collection.insert_one(insert_item)
    print("update success")

update_to_mongo(condition_term, condition_value, new_value)

根据提供的字段和值,查询出对应的数据,更新数据存入mongo(类似 update)

Parameters:

Name Type Description Default
condition_term

条件字段term

required
condition_value

条件字段值

required
new_value

新的值。最好是dict,不是dict的话不知道行不行

required

Returns:

Type Description
Source code in src/nlpertools/data_client.py
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
def update_to_mongo(self, condition_term, condition_value, new_value):
    """
    Update the document matching ``condition_term == condition_value``, or
    insert one when there is no match (similar to an update).

    :param condition_term: field name to match on
    :param condition_value: value the field must have
    :param new_value: passed as the ``$push`` specification on update, and
        stored under ``processed_data`` on insert; preferably a dict
    :return: None
    """
    selector = {condition_term: condition_value}
    already_stored = list(self.collection.find(selector))
    if not already_stored:
        self.collection.insert_one({
            condition_term: condition_value,
            "processed_data": new_value,
        })
    else:
        self.collection.update_one(selector, {"$push": new_value})
    print("update success")

CopyFunc

Source code in src/nlpertools/dataprocess.py
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
class CopyFunc():
    """
    Character helpers copied from https://github.com/lemon234071/clean-dialog

    Both helpers are static, so they can be called on the class or on an instance.
    """

    @staticmethod
    def is_chinese_char(cp):
        """
        Checks whether CP is the codepoint of a CJK character.

        :param cp: integer code point, e.g. ``ord(char)``
        :return: ``True`` when ``cp`` lies in one of the ranges below
        """
        # This defines a "chinese character" as anything in the CJK Unicode block:
        #   https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
        #
        # Note that the CJK Unicode block is NOT all Japanese and Korean characters,
        # despite its name. The modern Korean Hangul alphabet is a different block,
        # as is Japanese Hiragana and Katakana. Those alphabets are used to write
        # space-separated words, so they are not treated specially and handled
        # like the all of the other languages.
        return (
                0x4E00 <= cp <= 0x9FFF
                or 0x3400 <= cp <= 0x4DBF
                or 0x20000 <= cp <= 0x2A6DF
                or 0x2A700 <= cp <= 0x2B73F
                or 0x2B740 <= cp <= 0x2B81F
                or 0x2B820 <= cp <= 0x2CEAF
                or 0xF900 <= cp <= 0xFAFF
                or 0x2F800 <= cp <= 0x2FA1F
        )

    @staticmethod
    def contains_Chinese(seq):
        """
        Tell whether any character of ``seq`` is a CJK character.

        :param seq: iterable of single characters (e.g. a ``str``)
        :return: ``True`` if at least one character passes ``is_chinese_char``
        """
        # The sibling helper is reached through the class: names defined in a
        # class body are not visible as bare names inside its methods.
        return any(CopyFunc.is_chinese_char(ord(char)) for char in seq)

is_chinese_char(cp)

Checks whether CP is the codepoint of a CJK character.

Source code in src/nlpertools/dataprocess.py
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
class CopyFunc():
    """
    Character helpers copied from https://github.com/lemon234071/clean-dialog

    Both helpers are static, so they can be called on the class or on an instance.
    """

    @staticmethod
    def is_chinese_char(cp):
        """
        Checks whether CP is the codepoint of a CJK character.

        :param cp: integer code point, e.g. ``ord(char)``
        :return: ``True`` when ``cp`` lies in one of the ranges below
        """
        # This defines a "chinese character" as anything in the CJK Unicode block:
        #   https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
        #
        # Note that the CJK Unicode block is NOT all Japanese and Korean characters,
        # despite its name. The modern Korean Hangul alphabet is a different block,
        # as is Japanese Hiragana and Katakana. Those alphabets are used to write
        # space-separated words, so they are not treated specially and handled
        # like the all of the other languages.
        return (
                0x4E00 <= cp <= 0x9FFF
                or 0x3400 <= cp <= 0x4DBF
                or 0x20000 <= cp <= 0x2A6DF
                or 0x2A700 <= cp <= 0x2B73F
                or 0x2B740 <= cp <= 0x2B81F
                or 0x2B820 <= cp <= 0x2CEAF
                or 0xF900 <= cp <= 0xFAFF
                or 0x2F800 <= cp <= 0x2FA1F
        )

    @staticmethod
    def contains_Chinese(seq):
        """
        Tell whether any character of ``seq`` is a CJK character.

        :param seq: iterable of single characters (e.g. a ``str``)
        :return: ``True`` if at least one character passes ``is_chinese_char``
        """
        # The sibling helper is reached through the class: names defined in a
        # class body are not visible as bare names inside its methods.
        return any(CopyFunc.is_chinese_char(ord(char)) for char in seq)

Pattern

pattern_special_char = re.compile("[{}{}]".format(pattern_special_char_x[1:-1], pattern_special_char_u[1:-1])) a = "󘯦asdasdas v啊实打实v阿松大" res = re.sub(pattern_special_char, "$",a)

Source code in src/nlpertools/dataprocess.py
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
class Pattern:
    r"""
    Collection of regular-expression source strings used for text cleaning.

    Example::

        pattern_special_char = re.compile("[{}]".format(Pattern.special_char_pattern))
        a = "\U000d8be6asdasdas \x00v啊实打实\x00\x00v阿松大\x00"
        res = re.sub(pattern_special_char, "$", a)
    """

    # some from data-prepare

    # emoji
    # Another way to obtain emoji; it is unclear whether the pattern below is complete:
    #   import emoji  # Use version emoji==1.6.1, otherwise it won't have UNICODE_EMOJI
    #   emoji = list(emoji.UNICODE_EMOJI["en"].keys())
    emoji_pattern = "[\U00010000-\U0010ffff\\uD800-\\uDBFF\\uDC00-\\uDFFF]"

    # Garbled or invisible special characters.
    # \x09 (\t), \x0a (\n) and \x0d (\r) are not part of the class.
    special_char_x_pattern = "[\x00-\x08\x0b\x0c\x0e\x0f\x10-\x19\x1a-\x1f]"
    # Abnormal characters found by counting over a large corpus.
    special_char_u_pattern = (
        "[\u3000\U000d8be6\U000e0062\U000e0063\U000e0067\U000e0073\U000e0074\U000e007f]"
    )
    # Both classes merged, WITHOUT surrounding brackets (the [1:-1] slices strip them).
    special_char_pattern = "{}{}".format(
        special_char_x_pattern[1:-1], special_char_u_pattern[1:-1]
    )
    # Code points 0-31 and 127-159 as one character class.
    non_printing_characters_pattern = (
        f"[{''.join(map(chr, list(range(0, 32)) + list(range(127, 160))))}]"
    )

    # The patterns below are only meaningful when matched from the start of the text.
    # Chinese person name
    chinese_name_pattern = "(?:[\u4e00-\u9fa5·]{2,3})"
    # English person name
    english_name_pattern = r"(^[a-zA-Z][a-zA-Z\s]{0,20}[a-zA-Z]$)"
    # digits only
    pure_num_pattern = r"\d+"
    # expressions such as "xxxx figure/table"
    pic_table_descript_pattern = ".{1,15}图"

    # The patterns below need not be anchored at the start.
    # hlink
    hlink_pattern = (
        r"(https?|ftp|file)://[-A-Za-z0-9+&@#/%?=~_|!:,.;]+[-A-Za-z0-9+&@#/%=~_|]"
    )
    http_pattern = r"(http|https):\/\/([\w.]+\/?)\S*/\S*"
    # e-mail address (kept non-raw so that \u4e00-\u9fa5 stay literal characters)
    email_pattern = "[A-Za-z0-9\u4e00-\u9fa5]+@[a-zA-Z0-9_-]+(\\.[a-zA-Z0-9_-]+)+"
    # HTML tags; possibly too strict
    html_pattern = r"<[\s\S]*?>"
    # A character repeated at least twice in a row.  Must be a raw string:
    # in a normal literal "\1" is the character chr(1), not a backreference.
    repeat_pattern = r"(.)\1+"
    # Date; \1 re-uses the separator captured by "(-)", hence the raw string.
    day_time_pattern = r"\d{1,4}(-)(1[0-2]|0?[1-9])\1(0?[1-9]|[1-2]\d|30|31)"
    # time of day, HH:MM:SS
    hour_time_pattern = r"(?:[01]\d|2[0-3]):[0-5]\d:[0-5]\d"
    # stock codes
    stock_pattern = (
        r"(s[hz]|S[HZ])(000[\d]{3}|002[\d]{3}|300[\d]{3}|600[\d]{3}|60[\d]{4})"
    )

    # Usually used for replacement.
    # redundant spaces => " "
    redundancy_space_pattern = " +"
    # rarely needed: redundant line breaks => " "
    linebreak_pattern = "[\r\n\t]+"

    # Weibo video captions and similar markers
    weibo_pattern = r"([\s]\w+(的微博视频)|#|【|】|转发微博)"
    # @ mentions
    at_pattern = r"@\w+"

    # from https://github.com/bigscience-workshop/data-preparation pii
    # NOTE(review): the patterns from here on use \p{...} property classes, which
    # the standard ``re`` module rejects; presumably they are meant for the
    # third-party ``regex`` module - confirm at the call sites.
    year_patterns = [
        r"(?:^|[\b\s@?,!;:\'\")(.\p{Han}])([1-2][0-9]{3}[\p{Pd}/][1-2][0-9]{3})(?:$|[\s@,?!;:\'\"(.\p{Han}])",
        # yyyy-yyyy or yyyy/yyyy
        r"(?:^|[\b\s@?,!;:\'\")(.\p{Han}])([1-2][0-9]{3}[\p{Pd}/.][0-3][0-9][\p{Pd}/.][0-3][0-9])(?:$|[\s@,?!;:\'\"(.\p{Han}])",
        # yyyy-mm-dd or yyyy-dd-mm or yyyy/mm/dd or yyyy/dd/mm or yyyy.mm.dd or yyyy.dd.mm
        r"(?:^|[\b\s@?,!;:\'\")(.\p{Han}])([0-3][0-9][\p{Pd}/.][0-3][0-9][\p{Pd}/.](?:[0-9]{2}|[1-2][0-9]{3}))(?:$|[\s@,?!;:\'\"(.\p{Han}])",
        # mm-dd-yyyy or dd-mm-yyyy or mm/dd/yyyy or dd/mm/yyyy or mm.dd.yyyy or dd.mm.yyyy or the same but with yy instead of yyyy
        r"(?:^|[\b\s@?,!;:\'\")(.\p{Han}])([0-3][0-9][\p{Pd}/](?:[0-9]{2}|[1-2][0-9]{3}))(?:$|[\s@,?!;:\'\"(.\p{Han}])",
        # mm-yyyy or mm/yyyy or the same but with yy
        r"(?:^|[\b\s@?,!;:\'\")(.\p{Han}])([1-2][0-9]{3}-[0-3][0-9])(?:$|[\s@,?!;:\'\"(.\p{Han}])",
        # yyyy-mm (only "-" is accepted as the separator here)
    ]

    # Patterns for high-risk character strings
    id_pattern = r'(?:^|[\b\s@?,!;:\'\")(.\p{Han}])([A-Za-z]*(?:[\p{Pd}]*\p{Nd}){6,})(?:$|[\b\s@?,!;:\'\")(.\p{Han}])'
    # https://regex101.com/r/JQkmh8/2
    # key_pattern = r'(?:^|[\b\s@?,!;:\'\")(.\p{Han}])((?:(?:[A-Za-z]+[\p{Nd}\p{Pd}\/\+\=:]+|[\p{Nd}\p{Pd}\/\+\=:]+[A-Za-z]+)){4,}|(?:(?:\p{Nd}{3,}|[A-Z]+\p{Nd}+[A-Z]*|\p{Nd}+[A-Z]+\p{Nd}*)[\s\p{Pd}]?){4,})(?:$|[\b\s\p{Han}@?,!;:\'\"])'
    # https://regex101.com/r/JQkmh8/5
    key_pattern = r'(?:^|[\b\s@?,!:;\'\")(.\p{Han}])((?:(?:[A-Za-z]+[\p{Nd}\p{Pd}\/\+\=:_]+|[\p{Nd}\p{Pd}\/\+\=:]+[A-Za-z]+)){4,}|(?:(?:\p{Nd}{3,}|[A-Z]+\p{Nd}+[A-Z]*|\p{Nd}+[A-Z]+\p{Nd}*)[ \p{Pd}]?){3,})(?:$|[\b\s\p{Han}@?,!;:\'\")(.])'
    ipv4_pattern = r'(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)(?:\.(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)){3}'
    ipv6_pattern = r'(?:[0-9a-fA-F]{1,4}:){7,7}[0-9a-fA-F]{1,4}|(?:[0-9a-fA-F]{1,4}:){1,7}:|(?:[0-9a-fA-F]{1,4}:){1,6}:[0-9a-fA-F]{1,4}|(?:[0-9a-fA-F]{1,4}:){1,5}(?::[0-9a-fA-F]{1,4}){1,2}|(?:[0-9a-fA-F]{1,4}:){1,4}(?::[0-9a-fA-F]{1,4}){1,3}|(?:[0-9a-fA-F]{1,4}:){1,3}(?::[0-9a-fA-F]{1,4}){1,4}|(?:[0-9a-fA-F]{1,4}:){1,2}(?::[0-9a-fA-F]{1,4}){1,5}|[0-9a-fA-F]{1,4}:(?:(?::[0-9a-fA-F]{1,4}){1,6})|:(?:(?::[0-9a-fA-F]{1,4}){1,7}|:)|fe80:(?::[0-9a-fA-F]{0,4}){0,4}%[0-9a-zA-Z]{1,}|::(?:ffff(?::0{1,4}){0,1}:){0,1}(?:(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])\.){3,3}(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])|(?:[0-9a-fA-F]{1,4}:){1,4}:(?:(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])\.){3,3}(25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])'
    # The suffix is a raw triple-quoted literal so its value stays exactly what
    # the previous non-raw literal produced (quotes without backslashes).
    ip_pattern = r"(?:^|[\b\s@?,!;:\'\")(.\p{Han}])(" + r"|".join(
        [ipv4_pattern, ipv6_pattern]) + r""")(?:$|[\s@,?!;:'"(.\p{Han}])"""

    # https://regex101.com/r/EpA5B7/1
    # NOTE(review): written with layout whitespace, so it presumably has to be
    # compiled in verbose mode - confirm at the call site.
    email_line_pattern = r'''
        (?<= ^ | [\b\s@,?!;:)('".\p{Han}<] )
        (
        [^\b\s@?!;,:)('"<]+
        @
        [^\b\s@!?;,/]*
        [^\b\s@?!;,/:)('">.]
        \.
        \p{L} \w{1,}
        )
        (?= $ | [\b\s@,?!;:)('".\p{Han}>] )
    '''

    # https://regex101.com/r/mOqi1s/3
    # user_pattern = r'(?:^|[\s@,?!;:\'\")(\p{Han}])(@[^\s@,?!;:\'\")(]{3,})'
    user_pattern = r'''
    (?<= ^ | [)(\s@,?!;:'"\p{Han}] )
    (@
        [^)(\s@,?!;:'"]{3,}
    )
    '''

TextProcess

Bases: object

数据处理类 这是基类,如果是定制化的语言处理,请继承该类

Source code in src/nlpertools/dataprocess.py
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
class TextProcess(object):
    """
    数据处理类
    这是基类,如果是定制化的语言处理,请继承该类
    """

    def __init__(
            self,
            patterns_filter: List = None,
            patterns_replace: List[List] = None,
            words_filter: List = []
    ):
        """
        pattern_list:
        """
        self.patterns_filter, self.patterns_replace = self._pre_complie_pattern(
            patterns_filter, patterns_replace
        )
        self.words_filter = words_filter

    @staticmethod
    def _pre_complie_pattern(patterns_filter, patterns_replace):
        complied_patterns_replace, complied_patterns_filter = [], []
        for i in patterns_filter:
            complied_patterns_filter.append(re.compile(i))
        for i in patterns_replace:
            complied_patterns_replace.append((re.compile(i[0]), i[1]))
        return complied_patterns_filter, complied_patterns_replace

    def process(self, text):
        # 进来的数据都要做的标准化
        text = self.full2half(text)
        # text = self.filter_http(text)
        text = self.filter_html(text)
        text = self.filter_html_special(text)
        # 根据类型与语言分别处理
        text = self.filter_exclusive(text)
        # text = self.trandition2simple(text)
        # text = self.remove_stopwords(text)
        return text

    def filter_words(self, text):
        # 根据词典,命中返回True,需要过滤掉

        for word in self.words_filter:
            if word in text:
                return True
        return False

    def filter_whitelist(self, text):
        whitelist = re.compile(
            "[^\u4e00-\u9fa5^0-9a-zA-Z^-^《^》^<^>^【^】^(^)^{^}^–^…^”^“^,^.^;^?^:^‘^~^`^,^。^?^;^!^:^、^·^!^@^#^$^%^&^(^)^|]"
        )
        text = whitelist.sub("", text)
        return text

    def text_split(self, text, language):
        if language == "en":
            text = text[:256]
        elif language == "zh":
            text = text[:510]
        return text

    def trandition2simple(self, text):
        """
        Convert ``text`` with the ``zh-cn`` locale of ``zhconv``
        (traditional -> simplified).  Only meant for Chinese text.

        https://juejin.cn/post/7234554420163100728

        :param text: input string
        :return: converted string
        """
        # Convert the argument itself rather than a fixed sample sentence.
        return zhconv.convert(text, "zh-cn")

    def remove_stopwords(self, text):
        new_tokens = []
        if self.language == "en":
            tokens = text.split(" ")
        else:
            tokens = jieba.lcut(text)

        for i in tokens:
            if i in self.stopwords:
                pass
            else:
                new_tokens.append(i)

        return new_tokens

    @staticmethod
    def split_sentence(sentence, language="chinese"):
        """
        分句,英文有nltk,中文怎么能没有好的分句工具呢
        :param sentence:
        :param language:
        :return:
        """
        # sentences->Str
        # example '12“345。”“6789”'
        assert language in ["chinese", "english"], "unsupportable for other language"
        if language == "chinese":
            split_signs = list("。!?…\t")
            other_sign = "”"
        elif language == "english":
            split_signs = list(".!?")
            other_sign = '"'
        else:
            split_signs = list(".!?")
            other_sign = '"'
        sentences = []
        start_idx = 0
        for idx, char in enumerate(sentence):
            if idx == len(sentence) - 1:
                if char in split_signs:
                    sentences.append(sentence[start_idx: idx + 1].strip())
                    start_idx = idx + 1
                else:
                    sentences.append(sentence[start_idx:].strip())
            else:
                if char in split_signs:
                    if sentence[idx + 1] == other_sign:
                        if idx < len(sentence) - 2:
                            # 处理。”。
                            if sentence[idx + 2] not in split_signs:
                                sentences.append(sentence[start_idx: idx + 2].strip())
                                start_idx = idx + 2
                    elif sentence[idx + 1] not in split_signs:
                        sentences.append(sentence[start_idx: idx + 1].strip())
                        start_idx = idx + 1
        sentences = [i.strip() for i in sentences if i.strip()]
        return sentences

    def cut_word(self, text, language):
        if language == "en":
            tokens = text.split(" ")
        else:
            tokens = jieba.lcut(text)
        return tokens

    def full2half(self, text):
        """
        全角转化为半角
        :param text:
        :return:
        """
        ret_str = ""
        for i in text:
            if ord(i) >= 33 + 65248 and ord(i) <= 126 + 65248:
                ret_str += chr(ord(i) - 65248)
            else:
                ret_str += i
        return ret_str

    def filter_html(self, text):
        # 这个比较严格
        """
        过滤html标签
        :param text:
        :return:
        """
        patterns = [
            re.compile("//<![CDATA[[^>]*//]]>", re.I),  # 匹配CDATA
            re.compile("<s*script[^>]*>[^<]*<s*/s*scripts*>", re.I),  # Script
            re.compile("<s*style[^>]*>[^<]*<s*/s*styles*>", re.I),  # style
            re.compile("<brs*?/?>"),  # 处理换行
            re.compile("</?w+[^>]*>"),  # HTML标签
            re.compile("<!--[^>]*-->"),  # HTML注释
        ]
        for pattern in patterns:
            text = pattern.sub("", text)
        return text

    def filter_html_special(self, text):
        """
        替换所有html转义字符
        这个好像只有新闻有?
        :param text:
        :return:
        """
        # TODO html标签应该是 &nbsp 这种,\xa0也是吗
        CHAR_ENTITIES = {
            "&nbsp": " ",
            "160": " ",
            "lt": "<",
            "60": "<",
            "gt": ">",
            "62": ">",
            "amp": "&",
            "38": "&",
            "quot": '"',
            "34": '"',
            "ldquo": '"',
            "rdquo": '"',
            "mdash": "",
            "\xa0": "",
        }

        re_charEntity = re.compile(r"&#?(?P<name>\w+);", re.S)
        sz = re.search(re_charEntity, text)
        while sz:
            entity = sz.group()  # entity全称,如>
            key = sz.group("name")  # 去除&;后entity,如>为gt
            try:
                htmlstr = re_charEntity.sub(CHAR_ENTITIES[key], text, 1)
                text = htmlstr
                sz = re.search(re_charEntity, htmlstr)
            except KeyError:
                # 以空串代替
                htmlstr = re_charEntity.sub("", text, 1)
                text = htmlstr
                sz = re_charEntity.search(htmlstr)
        return text

    def filter_exclusive(self, text):
        """
        去除 @、 #、 表情等twitter、微博“特有”的情况
        :return:
        """
        pattern = r"([\s]\w+(的微博视频)|#|【|】|转发微博)"
        p = re.compile(pattern, re.S)
        text = p.sub("", text)

        dr = re.compile("@\w+", re.S)
        text = dr.sub("", text)

        return text

    def filter_html_tag(self, text):
        # res_tr = r'<a (.*?)></a>'
        # m_tr = re.findall(res_tr,text,re.S|re.M)
        res = re.sub(r"<a.*?>", "", text)
        res = re.sub(r"</a>", "", res)
        res = re.sub(r"<span.*?>", "", res)
        res = re.sub(r"</span>", "", res)
        res = re.sub(r"<img.*?>", "", res)
        res = re.sub(r"<br.*?>", "", res)
        res = re.sub(r"//", "", res)
        res = re.sub(r"@", "", res)
        res = re.sub(r"</", "", res)
        # res = re.sub(r',', '', res)
        # res = re.sub(r'&nbsp;', '', res)
        return res

    @staticmethod
    def uniform_whitespace(
            document,
            whitespace=[
                " ",
                " ",
                " ",
                " ",
                " ",
                " ",
                " ",
                " ",
                " ",
                " ",
                "",
                "„",
            ],
    ):
        # from https://github.com/bigscience-workshop/data-preparation
        """
        Replace every character listed in ``whitespace`` with a plain space.

        :param document: input string
        :param whitespace: characters to be treated as whitespace
        :return: ``document`` with each listed character replaced by ``" "``
        """
        # Converted to a set for the per-character membership test below; the
        # default list itself is never modified.
        whitespace = set(whitespace)
        document = "".join(
            [char if char not in whitespace else " " for char in document]
        )
        return document

    def filter_pattern(self, text):
        """
        返回True表示命中规则,需要过滤
        """
        for pattern in self.patterns_filter:
            if re.match(pattern, text):
                return True
        return False

    def replace_pattern(self, text):
        for pattern, replace in self.patterns_replace:
            text = re.sub(pattern, replace, text)
        return text

__init__(patterns_filter=None, patterns_replace=None, words_filter=[])

pattern_list:

Source code in src/nlpertools/dataprocess.py
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
"""

def __init__(
        self,
        patterns_filter: List = None,
        patterns_replace: List[List] = None,
        words_filter: List = None
):
    """
    Compile the configured patterns and store the word blacklist.

    :param patterns_filter: regex sources used by ``filter_pattern``;
        ``None`` means no patterns
    :param patterns_replace: ``[pattern, replacement]`` pairs used by
        ``replace_pattern``; ``None`` means no pairs
    :param words_filter: words checked by ``filter_words``; ``None`` means an
        empty list
    """
    # The defaults are None, so they are normalised before being iterated.
    self.patterns_filter, self.patterns_replace = self._pre_complie_pattern(
        [] if patterns_filter is None else patterns_filter,
        [] if patterns_replace is None else patterns_replace,
    )
    # A fresh list per instance; a shared ``[]`` default would leak appended
    # words from one instance into the next.
    self.words_filter = [] if words_filter is None else words_filter

filter_exclusive(text)

去除 @、 #、 表情等twitter、微博“特有”的情况

Returns:

Type Description
Source code in src/nlpertools/dataprocess.py
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
    return text

def filter_exclusive(self, text):
    """
    Remove markers that are specific to Twitter / Weibo: "... 的微博视频"
    captions, ``#``, ``【``, ``】``, "转发微博" and ``@mentions``.

    :param text: input string
    :return: ``text`` without those markers
    """
    weibo_markers = re.compile(r"([\s]\w+(的微博视频)|#|【|】|转发微博)", re.S)
    mentions = re.compile(r"@\w+", re.S)
    without_markers = weibo_markers.sub("", text)
    return mentions.sub("", without_markers)

filter_html(text)

过滤html标签

Parameters:

Name Type Description Default
text required

Returns:

Type Description
Source code in src/nlpertools/dataprocess.py
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
    return ret_str

def filter_html(self, text):
    # This variant is fairly strict.
    """
    Strip HTML markup from ``text``.

    Removed, in this order: ``//<![CDATA[ ... //]]>`` sections, ``<script>`` and
    ``<style>`` elements (including their content), ``<br>`` tags, any other
    opening/closing tag, and HTML comments. Every match is replaced by the
    empty string.

    :param text: HTML fragment
    :return: the text with the markup removed
    """
    # The patterns are raw strings with explicit ``\s`` / ``\w`` / ``\[`` escapes;
    # without the backslashes they only matched literal "s"/"w" characters and
    # left ordinary tags such as <div> in place.
    patterns = [
        re.compile(r"//<!\[CDATA\[[^>]*//\]\]>", re.I),  # CDATA sections
        re.compile(r"<\s*script[^>]*>[^<]*<\s*/\s*script\s*>", re.I),  # script
        re.compile(r"<\s*style[^>]*>[^<]*<\s*/\s*style\s*>", re.I),  # style
        re.compile(r"<br\s*?/?>"),  # line breaks
        re.compile(r"</?\w+[^>]*>"),  # HTML tags
        re.compile(r"<!--[^>]*-->"),  # HTML comments
    ]
    for pattern in patterns:
        text = pattern.sub("", text)
    return text

filter_html_special(text)

替换所有html转义字符 这个好像只有新闻有?

Parameters:

Name Type Description Default
text required

Returns:

Type Description
Source code in src/nlpertools/dataprocess.py
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
    return text

def filter_html_special(self, text):
    """
    Replace HTML character entities such as ``&lt;`` or ``&#160;``.

    Entities are resolved one at a time, always the left-most ``&name;`` /
    ``&#name;`` occurrence, and the text is re-scanned after every replacement
    (so ``&amp;lt;`` ends up as ``<``). Names found in the table are replaced by
    their character; unknown names are replaced by the empty string.

    :param text: text that may contain HTML entities
    :return: the text with all entities replaced
    """
    # TODO: entities look like "&nbsp;"; is "\xa0" one of them as well?
    # Keys are the captured names, i.e. the entity without "&", "#" and ";".
    CHAR_ENTITIES = {
        "nbsp": " ",  # was "&nbsp", which can never equal a captured name
        "160": " ",
        "lt": "<",
        "60": "<",
        "gt": ">",
        "62": ">",
        "amp": "&",
        "38": "&",
        "quot": '"',
        "34": '"',
        "ldquo": '"',
        "rdquo": '"',
        "mdash": "",
        "\xa0": "",  # never produced by the regex below; kept from the original table
    }

    re_char_entity = re.compile(r"&#?(?P<name>\w+);", re.S)
    match = re_char_entity.search(text)
    while match:
        # Unknown entities are replaced by the empty string.
        replacement = CHAR_ENTITIES.get(match.group("name"), "")
        text = text[:match.start()] + replacement + text[match.end():]
        match = re_char_entity.search(text)
    return text

filter_pattern(text)

返回True表示命中规则,需要过滤

Source code in src/nlpertools/dataprocess.py
512
513
514
515
516
517
518
519
520
521
    return document

def filter_pattern(self, text):
    """
    Check ``text`` against every pattern in ``self.patterns_filter``.

    :param text: string to test
    :return: True when ``re.match`` succeeds for at least one pattern,
        meaning the text hit a rule and should be filtered out; else False
    """
    return any(re.match(rule, text) for rule in self.patterns_filter)

full2half(text)

全角转化为半角

Parameters:

Name Type Description Default
text required

Returns:

Type Description
Source code in src/nlpertools/dataprocess.py
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
    return tokens

def full2half(self, text):
    """
    Convert full-width characters to their half-width counterparts.

    Characters whose code point lies in 33 + 65248 .. 126 + 65248 are shifted
    down by 65248; every other character is kept unchanged.

    :param text: input string
    :return: converted string
    """
    shift = 65248
    return "".join(
        chr(ord(ch) - shift) if 33 + shift <= ord(ch) <= 126 + shift else ch
        for ch in text
    )

split_sentence(sentence, language='chinese') staticmethod

分句,英文有nltk,中文怎么能没有好的分句工具呢

Parameters:

Name Type Description Default
sentence required
language 'chinese'

Returns:

Type Description
Source code in src/nlpertools/dataprocess.py
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
    return new_tokens

@staticmethod
def split_sentence(sentence, language="chinese"):
    """
    Split a text into sentences.

    A sentence ends at a split sign that is not followed by another split
    sign. When the split sign is directly followed by the closing quote, the
    quote stays with the sentence, unless the character after the quote is
    again a split sign. Empty pieces are dropped from the result.

    :param sentence: text to split, e.g. '12“345。”“6789”'
    :param language: "chinese" or "english"
    :return: list of stripped, non-empty sentences
    """
    assert language in ["chinese", "english"], "unsupportable for other language"
    if language == "chinese":
        split_signs = list("。!?…\t")
        other_sign = "”"
    else:
        split_signs = list(".!?")
        other_sign = '"'

    pieces = []
    begin = 0
    last = len(sentence) - 1
    for pos, ch in enumerate(sentence):
        if pos == last:
            # The final character always closes whatever is still pending.
            if ch in split_signs:
                pieces.append(sentence[begin: pos + 1].strip())
                begin = pos + 1
            else:
                pieces.append(sentence[begin:].strip())
            continue
        if ch not in split_signs:
            continue
        following = sentence[pos + 1]
        if following == other_sign:
            # Handles the "。”。" case: do not cut when yet another split sign follows the quote.
            if pos < last - 1 and sentence[pos + 2] not in split_signs:
                pieces.append(sentence[begin: pos + 2].strip())
                begin = pos + 2
        elif following not in split_signs:
            pieces.append(sentence[begin: pos + 1].strip())
            begin = pos + 1
    return [piece.strip() for piece in pieces if piece.strip()]

trandition2simple(text)

https://juejin.cn/post/7234554420163100728

Source code in src/nlpertools/dataprocess.py
309
310
311
312
313
314
315
316
317
    return text

def trandition2simple(self, text):
    # Only meaningful for Chinese text.
    """
    Convert Traditional Chinese text to Simplified Chinese with ``zhconv``.

    Reference: https://juejin.cn/post/7234554420163100728

    :param text: text to convert
    :return: result of ``zhconv.convert(text, "zh-cn")``
    """
    # Convert the caller's text; previously a hard-coded sample sentence was
    # converted and ``text`` was ignored.
    return zhconv.convert(text, "zh-cn")

uniform_whitespace(document, whitespace=[' ', '\u2009', '\u200a', '\u202f', '\u2005', '\u3000', '\u2002', '\xa0', '\u2008', '\u2003', '', '\x84']) staticmethod

There are different whitespace characters.

Source code in src/nlpertools/dataprocess.py
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
    return res

@staticmethod
def uniform_whitespace(
        document,
        whitespace=(
            " ",
            "\u2009",  # thin space
            "\u200a",  # hair space
            "\u202f",  # narrow no-break space
            "\u2005",  # four-per-em space
            "\u3000",  # ideographic space
            "\u2002",  # en space
            "\xa0",  # no-break space
            "\u2008",  # punctuation space
            "\u2003",  # em space
            "\ufffc",  # NOTE(review): invisible in the listing; assumed U+FFFC - confirm
            "\x84",  # NOTE(review): shown as '\x84' in the rendered signature - confirm
        ),
):
    # from https://github.com/bigscience-workshop/data-preparation
    """
    Replace every character listed in ``whitespace`` by a plain space.

    The defaults are written as explicit escapes because the raw characters
    are invisible or easily mangled in editors. The default is a tuple so it
    cannot be mutated between calls.

    :param document: text to normalise
    :param whitespace: iterable of single characters treated as whitespace
    :return: the document with those characters replaced by " "
    """
    whitespace = set(whitespace)
    return "".join(" " if char in whitespace else char for char in document)

DataAnalysis

Source code in src/nlpertools/ml.py
20
21
22
23
24
25
26
27
28
29
30
31
32
class DataAnalysis:
    @staticmethod
    def draw_pic(df, save_path):
        """
        Draw distribution plots that contrast two classes and save the figure.

        The "feature" column is plotted once for the rows with ``label == 1``
        (legend "label1") and once for the rows with ``label == 0`` (legend "label2").

        :param df: pd.DataFrame with "label" and "feature" columns
        :param save_path: str, path handed to ``plt.savefig``
        :return: None
        """
        for label_value, legend_name in ((1, "label1"), (0, "label2")):
            subset = df[df["label"] == label_value]["feature"]
            sns.distplot(subset, label=legend_name)
        plt.legend()
        plt.savefig(save_path)

draw_pic(df, save_path) staticmethod

画直方图,对比两个不同类别差异

Parameters:

Name Type Description Default
df

pd.DataFrame

required
save_path

str

required

Returns:

Type Description
Source code in src/nlpertools/ml.py
21
22
23
24
25
26
27
28
29
30
31
32
@staticmethod
def draw_pic(df, save_path):
    """
    Draw distribution plots that contrast two classes and save the figure.

    The "feature" column is plotted once for the rows with ``label == 1``
    (legend "label1") and once for the rows with ``label == 0`` (legend "label2").

    :param df: pd.DataFrame with "label" and "feature" columns
    :param save_path: str, path handed to ``plt.savefig``
    :return: None
    """
    for label_value, legend_name in ((1, "label1"), (0, "label2")):
        subset = df[df["label"] == label_value]["feature"]
        sns.distplot(subset, label=legend_name)
    plt.legend()
    plt.savefig(save_path)

STEM

Bases: object

Source code in src/nlpertools/ml.py
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
class STEM(object):
    """Extract simple event information from a sentence with an LTP model."""

    def __init__(self, IPT_MODEL_PATH):
        """
        :param IPT_MODEL_PATH: model path handed to ``LTP``
        """
        self.ltp = LTP(IPT_MODEL_PATH)

    def start_by_dep(self, sentence):
        """
        Return the subject found through dependency parsing.

        The arcs are scanned in order. The word of the most recent ``SBV`` arc is
        remembered as the subject, and the scan stops at the first arc whose
        relation contains ``VOB``, returning that subject.

        :param sentence: sentence to analyse
        :return: the subject word, or None when no ``VOB`` arc exists or no
            ``SBV`` arc precedes the first ``VOB`` arc
        """
        seg, hidden = self.ltp.seg([sentence])
        dep = self.ltp.dep(hidden)  # , graph=False)
        seg, dep = seg[0], dep[0]
        # Start from None so a VOB arc that comes before any SBV arc yields None
        # instead of failing on an unassigned local.
        subject = None
        for arc in dep:
            # subject-verb relation
            if 'SBV' == arc[2]:
                subject = seg[arc[0]]
            # verb-object relation: the object itself was never used, only the
            # subject is returned.
            if 'VOB' in arc[2]:
                return subject

        return None

    def start_by_srl(self, sentence):
        """
        Extract events with semantic role labelling.

        :param sentence: "他叫汤姆去拿外衣。"
        :return:  events: [['他', '叫', '汤姆', '去', '拿', '外衣'], ['汤姆', '拿', '外衣']]
        """
        seg, hidden = self.ltp.seg([sentence])
        srl = self.ltp.srl(hidden)
        seg, srl = seg[0], srl[0]
        events = []
        for wdx, each_srl in enumerate(srl):
            if each_srl:
                args = []
                for arg in each_srl:
                    args.extend(seg[arg[1]:arg[2] + 1])
                # Insert the predicate right after the words of the first argument.
                args.insert(each_srl[0][2] - each_srl[0][1] + 1, seg[wdx])
                events.append(args)
        return events

start_by_srl(sentence)

用语义角色标注工具

Parameters:

Name Type Description Default
sentence

"他叫汤姆去拿外衣。"

required

Returns:

Type Description

events: [['他', '叫', '汤姆', '去', '拿', '外衣'], ['汤姆', '拿', '外衣']]

Source code in src/nlpertools/ml.py
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
def start_by_srl(self, sentence):
    """
    Extract events with semantic role labelling.

    For every word that carries role spans, the words of all spans are
    collected in order and the word itself (the predicate) is inserted right
    after the words of the first span.

    :param sentence: "他叫汤姆去拿外衣。"
    :return:  events: [['他', '叫', '汤姆', '去', '拿', '外衣'], ['汤姆', '拿', '外衣']]
    """
    words, hidden = self.ltp.seg([sentence])
    roles = self.ltp.srl(hidden)
    words, roles = words[0], roles[0]
    events = []
    for predicate_idx, role_spans in enumerate(roles):
        if not role_spans:
            continue
        tokens = [tok for span in role_spans for tok in words[span[1]:span[2] + 1]]
        first_span = role_spans[0]
        # Position of the predicate: just behind the first argument's words.
        tokens.insert(first_span[2] - first_span[1] + 1, words[predicate_idx])
        events.append(tokens)
    return events

convert_crf_format_10_fold(corpus, objdir_path)

把已经是crf格式的数据,分成十折。 para:

Source code in src/nlpertools/ml.py
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
def convert_crf_format_10_fold(corpus, objdir_path):
    '''
    Split data that is already in CRF format into ten folds.

    For fold k (1..10) three files are written below ``objdir_path``:
    ``train{k}.txt`` with everything outside the fold, and ``test{k}.txt`` /
    ``dev{k}.txt``, which both receive the held-out slice. The tenth fold also
    takes the remainder left over by the integer split.

    :param corpus: sliceable sequence of samples
    :param objdir_path: output directory; passed to ``j_mkdir`` first
    '''
    j_mkdir(objdir_path)
    fold_size = int(len(corpus) / 10)
    for fold in range(10):
        begin = fold * fold_size
        if fold == 9:
            held_out = corpus[begin:]
            training = corpus[:begin]
        else:
            end = (fold + 1) * fold_size
            held_out = corpus[begin: end]
            training = corpus[:begin] + corpus[end:]
        number = fold + 1
        writetxt_w_list(training, os.path.join(objdir_path, 'train{}.txt'.format(number)))
        writetxt_w_list(held_out, os.path.join(objdir_path, 'test{}.txt'.format(number)))
        writetxt_w_list(held_out, os.path.join(objdir_path, 'dev{}.txt'.format(number)))

kfold(corpus, path, k=9, is_shuffle=True)

k是10份中训练集占了几份

Source code in src/nlpertools/ml.py
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
def kfold(corpus, path, k=9, is_shuffle=True):
    '''
    Split ``corpus`` into a training part and a held-out part, written as TSV files.

    The corpus is cut into tenths; the first ``k`` tenths go to ``train.tsv``
    and the rest is written to both ``test.tsv`` and ``dev.tsv``.

    :param corpus: list of samples; shuffled in place when ``is_shuffle`` is true
    :param path: output directory; passed to ``j_mkdir`` first
    :param k: how many of the 10 parts go to the training set
    :param is_shuffle: whether to ``random.shuffle`` the corpus before splitting

    Alternative train/val/test split with pandas + sklearn, kept for reference::

        import pandas as pd
        from sklearn.model_selection import KFold

        df = pd.DataFrame({
            "text": ["text_{}".format(i) for i in range(100)],
            "labels": ["label_{}".format(i % 10) for i in range(100)]
        })
        train_idx, test_and_val_idx = KFold(n_splits=8, shuffle=True).split(df).__next__()
        df_test_and_val = df.iloc[test_and_val_idx]
        test_idx, val_idx = KFold(n_splits=2, shuffle=True).split(df_test_and_val).__next__()
        df_train = df.iloc[train_idx]
        df_val = df.iloc[val_idx]
        df_test = df.iloc[test_idx]
        print(train_idx)
        print(val_idx)
        print(test_idx)
    '''
    j_mkdir(path)
    if is_shuffle:
        random.shuffle(corpus)
    cut = k * int(len(corpus) / 10)
    train_set = corpus[:cut]
    dev_set = corpus[cut:]
    writetxt_w_list(train_set, os.path.join(path, 'train.tsv'), num_lf=1)
    writetxt_w_list(dev_set, os.path.join(path, 'test.tsv'), num_lf=1)
    writetxt_w_list(dev_set, os.path.join(path, 'dev.tsv'), num_lf=1)

label(text, labels)

返回两列的标记数据序列

Parameters:

Name Type Description Default
text required
labels required

Returns:

Type Description
Source code in src/nlpertools/ml.py
175
176
177
178
179
180
181
182
183
184
def label(text, labels):
    '''
    Build a two-column (character, tag) training sequence.

    Each character of ``text`` is paired with the tag at the same position and
    written as "char<TAB>tag"; a space character is written as "[null]".
    Rows are joined with newlines.

    :param text: the raw string
    :param labels: one tag per character
    :return: the joined sequence as a single string
    '''
    rows = []
    for char, tag in zip(list(text), labels):
        if char != ' ':
            rows.append('\t'.join((char, tag)))
        else:
            rows.append('[null]\t{}'.format(tag))
    return '\n'.join(rows)

read_seq_res(path, labels)

读序列标注三列数据的方法

Parameters:

Name Type Description Default
path required
labels required

Returns:

Type Description
Source code in src/nlpertools/ml.py
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
def read_seq_res(path, labels):
    '''
    Read a three-column sequence-labelling result file.

    The file holds blocks separated by a blank line; every line of a block is
    "token<TAB>gold tag<TAB>predicted tag". For each block the tokens are
    concatenated, and for every tag in ``labels`` the line indices carrying
    that tag are collected, once for the gold column and once for the
    predicted column.

    :param path: path of the UTF-8 result file
    :param labels: tags to collect positions for
    :return: (texts, gold index dicts, predicted index dicts), one entry per block
    '''
    with codecs.open(path, 'r', 'utf-8') as rd:
        content = rd.read().strip()

    text, raw_label, predict_label = [], [], []
    for block in content.split('\n\n'):
        gold_positions, predicted_positions = {}, {}
        for name in labels:
            gold_positions.setdefault(name, [])
            predicted_positions.setdefault(name, [])
        joined = ''
        for position, row in enumerate(block.split('\n')):
            columns = row.split('\t')
            joined += columns[0]
            if columns[1] in labels:
                gold_positions[columns[1]].append(position)
            if columns[2] in labels:
                predicted_positions[columns[2]].append(position)
        text.append(joined)
        raw_label.append(gold_positions)
        predict_label.append(predicted_positions)
    return text, raw_label, predict_label

seed_everything(seed=7777777)

设置整个开发环境的seed

Parameters:

Name Type Description Default
seed 7777777
device required

Returns:

Type Description
None
Source code in src/nlpertools/ml.py
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
def seed_everything(seed=7777777) -> None:
    """
    Seed every random number generator of the development environment.

    Covers Python's ``random``, the ``PYTHONHASHSEED`` environment variable,
    NumPy and PyTorch (CPU generator and CUDA), and switches cuDNN to
    deterministic mode.

    :param seed: seed applied everywhere
    :return: None
    """
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (
            np.random.seed,
            torch.manual_seed,  # fixes the CPU random seed
            torch.cuda.manual_seed,
            torch.cuda.manual_seed_all,
    ):
        seeder(seed)
    # some cudnn methods can be random even after fixing the seed
    # unless you tell it to be deterministic
    torch.backends.cudnn.deterministic = True

split_sentence(sentence, language='chinese', cross_line=True)

分句,英文有nltk,中文怎么能没有好的分句工具呢

Parameters:

Name Type Description Default
sentence required
language 'chinese'
cross_line True

Returns:

Type Description
Source code in src/nlpertools/ml.py
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
def split_sentence(sentence, language='chinese', cross_line=True):
    """
    Split a text into sentences.

    Carriage returns are removed first. A sentence ends at a split sign that
    is not followed by another split sign. When the split sign is directly
    followed by the closing quote, the quote stays with the sentence, unless
    the character after the quote is again a split sign.

    :param sentence: text to split, e.g. '12“345。”“6789”'
    :param language: "chinese" or "english"
    :param cross_line: for Chinese, also treat "\\n" as a split sign
    :return: list of stripped sentences
    """
    assert language in ["chinese", "english"], "unsupportable for other language"
    sentence = sentence.replace("\r", "")
    if language == 'chinese':
        split_signs = list('。!?…')
        if cross_line:
            split_signs.append("\n")
        other_sign = "”"
    else:
        split_signs = list('.!?')
        other_sign = '"'

    pieces = []
    begin = 0
    last = len(sentence) - 1
    for pos, ch in enumerate(sentence):
        if pos == last:
            # The final character always closes whatever is still pending.
            if ch in split_signs:
                pieces.append(sentence[begin:pos + 1].strip())
                begin = pos + 1
            else:
                pieces.append(sentence[begin:].strip())
            continue
        if ch not in split_signs:
            continue
        following = sentence[pos + 1]
        if following == other_sign:
            # Handles the "。”。" case: do not cut when yet another split sign follows the quote.
            if pos < last - 1 and sentence[pos + 2] not in split_signs:
                pieces.append(sentence[begin:pos + 2].strip())
                begin = pos + 2
        elif following not in split_signs:
            pieces.append(sentence[begin:pos + 1].strip())
            begin = pos + 1
    return pieces

subject_object_labeling(spo_list, text)

百度那种有spo字典的数据,给标成。草,看不懂,得找找哪里用的

Parameters:

Name Type Description Default
spo_list required
text required

Returns:

Type Description

labeling_list

Source code in src/nlpertools/ml.py
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
def subject_object_labeling(spo_list, text):
    # TODO: the intended use is unclear; find the call sites.
    '''
    Build a character-level BIO tag list for ``text`` from Baidu-style SPO dicts.

    The triples are grouped by predicate. For the first predicate that occurs
    in ``text`` and produces at least one tag, the first occurrence of each
    subject, object and of the predicate itself is tagged ``B-``/``I-`` with
    ``SUB``, ``OBJ`` and ``PRE`` respectively. Spans that do not occur in
    ``text`` are skipped.

    :param spo_list: list of dicts with "predicate", "subject" and "object" keys
    :param text: the sentence the triples refer to
    :return: labeling_list (one tag per character), or None when nothing was tagged
    '''

    def _spo_list_to_spo_predicate_dict(spo_list):
        """Group the (subject, object) pairs by predicate."""
        spo_predicate_dict = dict()
        for spo_item in spo_list:
            predicate = spo_item["predicate"]
            subject = spo_item["subject"]
            obj = spo_item["object"]
            spo_predicate_dict.setdefault(predicate, []).append((subject, obj))
        return spo_predicate_dict

    def _index_q_list_in_k_list(q_list, k_list):
        """Return the first index at which q_list occurs in k_list, or None."""
        q_list_length = len(q_list)
        for idx in range(len(k_list) - q_list_length + 1):
            if all(q == k for q, k in zip(q_list, k_list[idx: idx + q_list_length])):
                return idx
        return None

    def _labeling_type(spo, spo_type):
        """Tag the first occurrence of ``spo`` in the text with B-/I- ``spo_type``."""
        idx_start = _index_q_list_in_k_list(q_list=spo, k_list=text)
        if idx_start is None:
            # The span does not occur in the text; indexing with None would fail.
            return
        labeling_list[idx_start] = 'B-' + spo_type
        # For a one-character span this slice is empty and nothing else changes.
        labeling_list[idx_start + 1: idx_start + len(spo)] = ['I-' + spo_type] * (len(spo) - 1)

    spo_predicate_dict = _spo_list_to_spo_predicate_dict(spo_list)
    labeling_list = ['O'] * len(text)
    for predicate, spo_list_form in spo_predicate_dict.items():
        if predicate in text:
            for (spo_subject, spo_object) in spo_list_form:
                _labeling_type(spo_subject, 'SUB')
                _labeling_type(spo_object, 'OBJ')
                _labeling_type(predicate, 'PRE')
            if labeling_list != ['O'] * len(text):
                return labeling_list
    return None

GaussDecay

Bases: object

当前只实现了时间的,全部使用默认值

Source code in src/nlpertools/other.py
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
class GaussDecay(object):
    """
    Gaussian decay scoring.

    Currently only the time task is implemented and everything uses default values.
    """

    def __init__(self, origin='2022-08-02', scale='90d', offset='5d', decay=0.5, task="time"):
        """
        :param origin: reference date; stored only
        :param scale: passed to ``translate``
        :param offset: passed to ``translate``
        :param decay: decay value used in the sigma computation of ``calc_gauss``
        :param task: task name, "time" by default
        """
        self.origin = origin
        self.task = task
        self.scale, self.offset = self.translate(scale, offset)
        self.decay = decay
        # Weights of the decay score and of the raw relevance score.
        self.time_coefficient = 0.6
        self.related_coefficient = 0.4

    def translate(self, scale, offset):
        """
        Convert the domain-specific input into standard values.

        At present the arguments are not parsed; every task yields (180, 5).

        :return: (scale, offset)
        """
        if self.task == "time":
            scale = 180
            offset = 5
        else:
            scale = 180
            offset = 5
        return scale, offset

    @staticmethod
    def translated_minus(field_value):
        """
        Return the number of whole days between now and ``field_value``.

        NOTE(review): the distance is measured from ``datetime.datetime.now()``,
        not from ``self.origin``.

        :param field_value: timestamp string in '%Y-%m-%d %H:%M:%S' format
        """
        origin = datetime.datetime.now()
        field_value = datetime.datetime.strptime(field_value, '%Y-%m-%d %H:%M:%S')
        return (origin - field_value).days

    def calc_exp(self):
        """Not implemented."""
        pass

    def calc_liner(self):
        """Not implemented."""
        pass

    def calc_gauss(self, raw_score, field_value):
        # Raw docstring: in a plain string "\f" of "\frac" is a form feed.
        r"""
        $$S(doc)=exp(-\frac{max(0,|fieldvalues_{doc}-origin|-offset)^2}{2σ^2})$$ -
        $$σ^2=-scale^2/(2·ln(decay))$$

        The decay score S is blended with the raw score as
        ``time_coefficient * S + related_coefficient * raw_score``.

        :param raw_score: relevance score of the document
        :param field_value: timestamp string accepted by ``translated_minus``
        :return: the blended score rounded to 7 decimals
        """
        numerator = max(0, (abs(self.translated_minus(field_value)) - self.offset)) ** 2
        sigma_square = -1 * self.scale ** 2 / (2 * math.log(self.decay, math.e))
        denominator = 2 * sigma_square
        s = math.exp(-1 * numerator / denominator)
        return round(self.time_coefficient * s + self.related_coefficient * raw_score, 7)

calc_gauss(raw_score, field_value)

$$S(doc)=\exp\left(-\frac{\max(0,|fieldvalues_{doc}-origin|-offset)^2}{2\sigma^2}\right)$$ - $$\sigma^2=-\frac{scale^2}{2\cdot\ln(decay)}$$

Parameters:

Name Type Description Default
raw_score required
field_value required

Returns:

Type Description
Source code in src/nlpertools/other.py
350
351
352
353
354
355
356
357
358
359
360
361
362
def calc_gauss(self, raw_score, field_value):
    # Raw docstring: in a plain string "\f" of "\frac" is a form feed.
    r"""
    $$S(doc)=exp(-\frac{max(0,|fieldvalues_{doc}-origin|-offset)^2}{2σ^2})$$ -
    $$σ^2=-scale^2/(2·ln(decay))$$

    The decay score S is blended with the raw score as
    ``time_coefficient * S + related_coefficient * raw_score``.

    :param raw_score: relevance score of the document
    :param field_value: value handed to ``self.translated_minus``
    :return: the blended score rounded to 7 decimals
    """
    # Distance beyond the offset; inside the offset there is no decay.
    distance = abs(self.translated_minus(field_value)) - self.offset
    numerator = max(0, distance) ** 2
    sigma_square = -1 * self.scale ** 2 / (2 * math.log(self.decay, math.e))
    denominator = 2 * sigma_square
    s = math.exp(-1 * numerator / denominator)
    return round(self.time_coefficient * s + self.related_coefficient * raw_score, 7)

translate(scale, offset)

将领域的输入转化为标准

Returns:

Type Description
Source code in src/nlpertools/other.py
325
326
327
328
329
330
331
332
333
334
335
336
def translate(self, scale, offset):
    """
    Convert the domain-specific ``scale`` / ``offset`` input into standard values.

    At present the arguments are not parsed: the "time" task and every other
    task both yield the fixed pair (180, 5).

    :return: (scale, offset)
    """
    standard = (180, 5)
    if self.task == "time":
        return standard
    return standard

auto_close()

针对企业微信15分钟会显示离开的机制,假装自己还在上班

Source code in src/nlpertools/other.py
275
276
277
278
279
280
281
282
283
284
285
286
287
def auto_close():
    """
    Keep the desktop active so WeCom, which shows "away" after 15 minutes,
    keeps showing the user as present.

    First registers a one-off scheduled task (``schtasks``) that shuts the
    machine down at 23:30, then loops forever: move the mouse to (970, 17),
    click, sleep 840 seconds. Never returns.
    """
    import pyautogui as pg
    import time
    import os

    os.system('schtasks /create /tn shut /tr "shutdown -s -f" /sc once /st 23:30')
    while True:
        pg.moveTo(970, 17, 2)
        pg.click()
        time.sleep(840)

camel_to_snake(s)

将 camel case 转换到 snake case.

Parameters:

Name Type Description Default
s str

camel case variable

required

Returns:

Type Description
str
Source code in src/nlpertools/other.py
128
129
130
131
132
133
134
def camel_to_snake(s: str) -> str:
    """
    Convert a camel case name to snake case.

    An underscore is inserted before every upper-case character except the
    first character of the string, then the result is lower-cased. The empty
    string is returned unchanged.

    :param s: camel case variable
    :return: snake case variable
    """
    # ``idx`` is 0 (falsy) for the first character, so no leading underscore is added.
    pieces = ['_' + ch if (idx and ch.isupper()) else ch for idx, ch in enumerate(s)]
    return ''.join(pieces).lower()

git_push()

针对国内提交github经常失败,自动提交

Source code in src/nlpertools/other.py
103
104
105
106
107
108
109
110
111
112
113
114
115
def git_push():
    """
    Retry ``git push`` until it succeeds.

    Written because pushing to GitHub from mainland China fails frequently.
    Loops without a delay and without an upper bound on the number of attempts.
    """
    num = -1
    while 1:
        num += 1
        print("retry num: {}".format(num))
        res = os.system("git push --set-upstream origin main")
        print(str(res))
        # os.system returns the command's exit status as an int (0 on success),
        # never git's "fatal: ..." text, so success is detected by status.
        if res == 0:
            print("scucess")
            break

snake_to_camel(s)

author: u 将 snake case 转换到 camel case.

Parameters:

Name Type Description Default
s str

snake case variable

required

Returns:

Type Description
str
Source code in src/nlpertools/other.py
118
119
120
121
122
123
124
125
def snake_to_camel(s: str) -> str:
    """
    author: u
    Convert a snake case name to camel case: the string is title-cased with
    ``str.title`` and the underscores are removed.

    :param s: snake case variable
    :return: e.g. "snake_case" -> "SnakeCase"
    """
    title_cased = s.title()
    return title_cased.replace("_", "")

spider(url)

Parameters:

Name Type Description Default
url required

Returns:

Type Description
Source code in src/nlpertools/other.py
187
188
189
190
191
192
193
194
195
196
197
198
199
def spider(url):
    """
    Fetch a Baijiahao article and return its title and body text.

    Only URLs containing 'baijiahao' are handled; for any other URL the
    function returns None.

    :param url: article URL
    :return: "title\\nbody" for Baijiahao URLs, otherwise None
    """
    if 'baijiahao' in url:
        content = requests.get(url)
        html = pq.PyQuery(content.text)
        title = html('.index-module_articleTitle_28fPT').text()
        res = html('.index-module_articleWrap_2Zphx').text()
        # Remove the trailing "report/feedback" footer as a suffix. ``rstrip``
        # would treat the argument as a character set and could also eat
        # legitimate trailing characters of the article.
        footer = '举报/反馈'
        if res.endswith(footer):
            res = res[:-len(footer)]
        return '{}\n{}'.format(title, res)

EmailClient

Bases: object

Source code in src/nlpertools/plugin.py
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
class EmailClient(object):
    """
    Minimal plain-text mail sender using smtp.qq.com on port 25.

    Usage: create the client, fill in ``mail_user``, ``mail_pass`` and
    ``receiver``, then call ``sent_email(title, content)``.
    """

    def __init__(self):
        self.mail_user = ""  # login name, also used as the sender address
        self.mail_pass = ""  # login password
        self.receiver = ""  # recipient address

    def sent_email(self, title, content):
        """
        Send a plain-text UTF-8 mail from ``self.mail_user`` to ``self.receiver``.

        SMTP errors are caught and printed rather than raised.

        :param title: subject line
        :param content: message body
        """

        # login info
        mail_host = 'smtp.qq.com'
        mail_user = self.mail_user
        mail_pass = self.mail_pass
        sender = mail_user

        # email info
        message = MIMEText(content, 'plain', 'utf-8')
        message['Subject'] = title
        message['From'] = sender
        message['To'] = self.receiver

        # log in and send
        smtpObj = smtplib.SMTP()
        try:
            smtpObj.connect(mail_host, 25)
            smtpObj.login(mail_user, mail_pass)
            smtpObj.sendmail(sender, self.receiver, message.as_string())
            smtpObj.quit()
            print('send email succes')
        except smtplib.SMTPException as e:
            print('erro', e)
        finally:
            # Release the socket even when login or sendmail failed.
            smtpObj.close()

sent_email(title, content)

mail_user = 'xxx'

mail_pass = 'xxx'

receiver = 'xxx'

sent_email(mail_user, mail_pass, receiver)

Source code in src/nlpertools/plugin.py
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
def sent_email(self, title, content):
    """
    Send a plain-text UTF-8 mail from ``self.mail_user`` to ``self.receiver``
    through smtp.qq.com on port 25.

    ``self.mail_user``, ``self.mail_pass`` and ``self.receiver`` must be set
    beforehand. SMTP errors are caught and printed rather than raised.

    :param title: subject line
    :param content: message body
    """

    # login info
    mail_host = 'smtp.qq.com'
    mail_user = self.mail_user
    mail_pass = self.mail_pass
    sender = mail_user

    # email info
    message = MIMEText(content, 'plain', 'utf-8')
    message['Subject'] = title
    message['From'] = sender
    message['To'] = self.receiver

    # log in and send
    smtpObj = smtplib.SMTP()
    try:
        smtpObj.connect(mail_host, 25)
        smtpObj.login(mail_user, mail_pass)
        smtpObj.sendmail(sender, self.receiver, message.as_string())
        smtpObj.quit()
        print('send email succes')
    except smtplib.SMTPException as e:
        print('erro', e)
    finally:
        # Release the socket even when login or sendmail failed.
        smtpObj.close()

convert_import_string_to_import_list(text)

该方法将 import 转变为 try import

Source code in src/nlpertools/utils_for_nlpertools.py
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
def convert_import_string_to_import_list(text):
    """
    该方法将 import 转变为 try import
    """
    models_to_import = []
    import_list = text.split("\n")
    for each in import_list:
        print(each)
        name, package, as_name = None, None, None
        elements = each.split(" ")
        for pre, cur in zip(elements, elements[1:]):
            if cur.endswith(","):
                cur = cur.rstrip(",")
            # 为了实现from import 和 import统一,首先把package和name的含义反过来,后面再掉换
            if pre == "import":
                package = cur
            if pre == "from":
                name = cur
            if pre == "as":
                as_name = cur
            if pre[-1] == ",":
                # 针对 from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
                # 将将前面部分和当前的组成新字段
                prefix = each.split("import")[0]
                import_list.append("{}import {}".format(prefix, cur))
        if not as_name:
            as_name = package.split(".")[-1]
        if not name:
            name, package = package, name
        models_to_import.append((name, package, as_name))
    # 打印
    all_import_info = ["", "from utils_for_nlpertools import try_import", ""]
    for name, package, as_name in models_to_import:
        import_info = '{} = try_import("{}", {})'.format(as_name, name, '"{}"'.format(package) if package else package)
        all_import_info.append(import_info)
        print(import_info)
    return all_import_info

fn_async_timer(function)

针对异步函数的装饰器

Source code in src/nlpertools/wrapper.py
10
11
12
13
14
15
16
17
18
19
20
21
22
def fn_async_timer(function):
    """
    Decorator for coroutine functions: awaits the wrapped coroutine, prints
    how long it took and returns its result.
    """
    @wraps(function)
    async def function_timer(*args, **kwargs):
        started = time.time()
        outcome = await function(*args, **kwargs)
        elapsed = time.time() - started
        print('[finished {func_name} in {time:.2f}s]'.format(func_name=function.__name__, time=elapsed))
        return outcome

    return function_timer

fn_timeout_checker(wait_time, callback)

超时判断的装饰器 两个包,使用gevent出现bug

Source code in src/nlpertools/wrapper.py
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
def fn_timeout_checker(wait_time, callback):
    """
    Decorator factory that falls back to ``callback()`` when the decorated
    function does not finish within ``wait_time``.

    Calling the factory applies ``eventlet.monkey_patch(time=True)``.
    Two packages were tried; gevent showed a bug (its ``patch_all`` with
    ``thread=False`` raised errors together with a Flask app running with
    ``threaded=True``), so eventlet is used.

    :param wait_time: timeout handed to ``eventlet.Timeout``
    :param callback: zero-argument callable producing the fallback result
    """
    from eventlet import Timeout
    from eventlet import monkey_patch

    monkey_patch(time=True)

    def wrapper(func):
        def inner(*args, **kwargs):
            # Private marker: still in place after the block means func did not finish.
            unfinished = object()
            outcome = unfinished
            with Timeout(wait_time, False):
                outcome = func(*args, **kwargs)
            if outcome is unfinished:
                outcome = callback()
            return outcome

        return inner

    return wrapper

fn_timer(async_func=False, analyse=False)

@fn_timer() def example(): time.sleep(2)

Parameters:

Name Type Description Default
analyse False

Returns:

Type Description
Source code in src/nlpertools/wrapper.py
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
def fn_timer(async_func=False, analyse=False):
    """
    Decorator factory that reports how long the decorated function takes.

    >>> @fn_timer()
    >>> def example():
    >>>     time.sleep(2)

    :param async_func: pass True to decorate a coroutine function
    :param analyse: for sync functions, profile the call with pyinstrument and
        print the profile instead of the elapsed time
    :return: the decorator
    """

    def wrapper(func):
        report = '[finished {func_name} in {time:.2f}s]'

        async def func_time_async(*args, **kwargs):
            started = time.time()
            result = await asyncio.create_task(func(*args, **kwargs))
            elapsed = time.time() - started
            print(report.format(func_name=func.__name__, time=elapsed))
            return result

        def func_time(*args, **kwargs):
            started = time.time()
            result = func(*args, **kwargs)
            elapsed = time.time() - started
            print(report.format(func_name=func.__name__, time=elapsed))
            return result

        def func_time_analyse(*args, **kwargs):
            from pyinstrument import Profiler

            profiler = Profiler()
            profiler.start()
            result = func(*args, **kwargs)
            profiler.stop()
            profiler.print()
            return result

        if async_func is True:
            return func_time_async
        if analyse:
            return func_time_analyse
        return func_time

    return wrapper

fn_try(parameter)

该函数把try...catch...封装成装饰器, 接收一个字典参数,并把其中的msg字段改为具体报错信息

Parameters:

Name Type Description Default
parameter

{"msg": "", etc.}

required

Returns:

Type Description

parameter: {"msg": 内容填充为具体的报错信息, etc.}

Source code in src/nlpertools/wrapper.py
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
def fn_try(parameter):
    """
    Wrap try/except into a decorator.

    When the decorated function raises, a copy of ``parameter`` is returned
    whose "msg" entry is the original "msg" template formatted with the error
    text. ``parameter`` itself is left untouched, so the template stays usable
    for later failures.

    :param parameter: {"msg": "", etc.}; "msg" is a ``str.format`` template
    :return: decorator; the decorated function returns its normal result, or
        on error {"msg": <template filled with the error text>, etc.}
    """

    def wrapper(function):
        def inner(*args, **kwargs):
            try:
                return function(*args, **kwargs)
            except Exception as e:
                msg = "报错!"
                print('[func_name: {func_name} {msg}]'.format(func_name=function.__name__, msg=msg))
                # Fill a copy: writing into the shared dict would replace the
                # template after the first failure.
                failure = parameter.copy()
                failure["msg"] = parameter["msg"].format(str(e))
                return failure

        return inner

    return wrapper