import pandas as pd
import numpy as np
# Load the dataset.
data = pd.read_csv('ner_dataset.csv', encoding='latin1')
# Forward-fill missing cells from the previous row. DataFrame.ffill() is the
# supported spelling; fillna(method='ffill') is deprecated in pandas 2.x.
data = data.ffill()
data.tail(10)  # bare expression: only shows output in an interactive/notebook session
# Predefined data structures.
words = list(set(data['Word'].values))  # vocabulary of distinct words (set order is arbitrary)
print(words[:50])
n_words = len(words)  # total number of distinct words
n_words  # bare expression: only shows output in an interactive/notebook session
from sklearn.base import BaseEstimator, TransformerMixin
class MajorityVotingTagger(BaseEstimator, TransformerMixin):
    """Baseline tagger that memorises the most frequent tag of every word.

    After ``fit`` the mapping ``word -> most frequent tag`` is stored in
    ``self.mjvote``. ``predict`` looks each word up in that mapping and
    falls back to ``'O'`` for words that were not seen during fitting.
    """

    def fit(self, X, y):
        """Count the tags observed for each word and keep the majority tag.

        Parameters
        ----------
        X : iterable of hashable words.
        y : iterable of tags, aligned element-by-element with ``X``.

        Returns
        -------
        self
            The fitted tagger, so calls can be chained
            (``tagger.fit(X, y).predict(X)``) as scikit-learn estimators
            are expected to allow.
        """
        # word -> {tag: number of times the word carried that tag}
        # e.g. {"Indian": {"B-gpe": 4, "B-geo": 1, ...}, ...}
        word2cnt = {}
        for word, tag in zip(X, y):
            tag_counts = word2cnt.setdefault(word, {})
            tag_counts[tag] = tag_counts.get(tag, 0) + 1

        # Keep the tag with the highest count for each word. On a tie, max()
        # returns the first maximal key, i.e. the tag seen first for the word.
        self.mjvote = {
            word: max(tag_counts, key=tag_counts.get)
            for word, tag_counts in word2cnt.items()
        }
        return self

    def predict(self, X, y=None):
        """Return the memorised tag for every word in ``X``.

        Words that were not seen during ``fit`` are tagged ``'O'``.
        ``y`` is accepted for API compatibility and ignored.
        """
        return [self.mjvote.get(x, 'O') for x in X]
# Flatten the 'Word' and 'Tag' columns into two parallel Python lists
# (one entry per token, in row order).
words, tags = (data[column].values.tolist() for column in ('Word', 'Tag'))
# Show the first ten tokens, then their tags on the next line.
print(words[:10])
print(tags[:10])
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import classification_report
# Out-of-fold predictions from 5-fold cross-validation of the baseline tagger.
pred = cross_val_predict(MajorityVotingTagger(), words, tags, cv=5)
# Build the classification report (true tags first, predictions second) and print it.
report = classification_report(tags, pred)
print(report)
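# Source article: https://blog.csdn.net/qq_37150711/article/details/107676293
# If you have questions about this article, leave a comment on the page above.
#
# The lines below are unrelated "recommended articles" links and page footer
# text captured from the web page; they are not part of the script.
#   Win10磁盘如何解除BitLocker加密 Win10解除BitLocker加密方法
#   Win10版本 20H2改进了哪些内容?Win10版本 2020改进内容介绍
#   win10开机后键盘失灵重启才能使用如何解决 键盘重启后可用的解决方法
#   Win10更新KB4566782和KB4565351出现0x800f081f错误怎么办?
#   win10系统备份报错0x8078006b创建共享保护点失败怎么办?
#   redmi airdots 2蓝牙耳机和redmi airdots青春版哪个好
#   win10如何关闭数据执行保护 win10关闭数据执行保护图文教程
#   网友评论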