import numpy as np
import pandas as pd
# Load ISEAR.csv. header=None tells pandas the file has no header row,
# so the columns are addressed by integer position (0, 1, ...).
data = pd.read_csv(
    'ISEAR.csv',
    header=None,
)
# Preview the first rows (only rendered in an interactive session).
data.head()
# Column layout: label, content
# The most classic (standard) pipeline
from sklearn.model_selection import train_test_split
# Column 1 holds the sentence text and column 0 the label; both are
# converted to plain Python lists before splitting.
sents = data[1].values.tolist()
labels = data[0].values.tolist()

# Hold out 20% of the samples for testing; the fixed seed keeps the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    sents,
    labels,
    test_size=0.2,
    random_state=42,
)
# Extract TF-IDF features
# Transform both the training data and the test data
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()

# fit_transform = fit (learn the vocabulary and IDF weights from the
# training text) followed by transform (turn the text into a matrix).
X_train = vectorizer.fit_transform(X_train)

# Use transform only here, NOT fit_transform: the test data must never
# take part in fitting the vectorizer.
X_test = vectorizer.transform(X_test)
# - Part-of-speech features
# - n-gram features (the TF-IDF above is unigram)
# Training (logistic regression)
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
# Candidate values of C (inverse regularization strength) to validate.
# The original list contained 0.0001 twice; the duplicate only repeated
# identical cross-validation fits, so it has been removed.
parameters = {'C': [0.0001, 0.001, 0.005, 0.01, 0.05,
                    0.1, 0.5, 1.2, 5, 10]}

lr = LogisticRegression()  # build the model

# Baseline: default hyper-parameters, scored on the held-out test set.
# The score is captured and printed so it is not silently discarded
# when this runs as a script rather than in a notebook cell.
baseline_score = lr.fit(X_train, y_train).score(X_test, y_test)
print('baseline test score:', baseline_score)

# 5-fold cross-validation over the C grid (training data split into five
# folds). GridSearchCV clones `lr`, so the baseline fit above does not
# influence the search.
clf = GridSearchCV(lr, parameters, cv=5)
clf.fit(X_train, y_train)

# Test-set score of the best estimator found by the search.
tuned_score = clf.score(X_test, y_test)
print('grid-search test score:', tuned_score)
print(clf.best_params_)
# Confusion matrix D: shows where the errors occur. D_ij is the number of
# class-i samples that were misclassified as class j (for i ≠ j).
# Confusion matrix (multi-class problem)
from sklearn.metrics import confusion_matrix # 混淆矩阵
# Predict labels for the test set with the tuned model, then tabulate
# true labels (first argument) against predicted labels (second argument).
y_pred = clf.predict(X_test)
confusion_matrix(y_test, y_pred)
本文地址:https://blog.csdn.net/qq_37150711/article/details/107439827
如对本文有疑问, 点击进行留言回复!!
听课笔记--Python数据分析--Numpy基础及基本应用
Python-定时任务APScheduler中两种调度器的区别
网友评论