import chardet
import codecs
import re

# Detect the file's encoding before reading it
file_path = '/home/ricardo/out/news_sohusite_xml.dat'
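The chardet import is only useful if we actually call it; a minimal detection step, as a sketch (a few KB of raw bytes is usually enough for a confident guess):

with open(file_path, 'rb') as raw:
    sample = raw.read(10000)     # a raw byte sample for detection
print(chardet.detect(sample))    # expected to report GB2312/GBK here, matching the encoding used below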
# Read the file as GB2312, skipping undecodable bytes
f2 = codecs.open(file_path, encoding='GB2312', errors='ignore')
content2 = f2.read()
f2.close()
# Write the extracted text to a UTF-8 file
f = codecs.open('/home/ricardo/out/news_sohusite_xml.txt', 'w', encoding='utf8')
# Extract the text between <content> and </content>; the non-greedy
# pattern keeps multiple contents on one line from being merged
a = re.findall('<content>(.*?)</content>', content2)
print("Length of list: %d" % len(a))
for i, item in enumerate(a, 1):
    f.write(item + '\n')
    if i % 1000 == 0:
        print("index: %d / %d" % (i, len(a)))
f.close()
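As a quick sanity check of the extraction (a sketch; the exact articles depend on the dataset dump you downloaded):

with codecs.open('/home/ricardo/out/news_sohusite_xml.txt', encoding='utf8') as chk:
    for _ in range(3):
        print(chk.readline().strip()[:50])   # peek at the first few extracted articles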
import jieba
jieba.enable_parallel()  # parallel segmentation; not supported on Windows

# Build the stop-word set (a set makes the membership test O(1))
def stopwordslist():
    with open('/home/ricardo/stopwords/hit_stopwords.txt', encoding='UTF-8') as fstop:
        return set(line.strip() for line in fstop)

# Load the stop words once instead of re-reading the file for every line
stopwords = stopwordslist()

# Segment one line of Chinese text with jieba and drop the stop words
def seg_depart(sentence):
    sentence_depart = jieba.cut(sentence.strip())
    outstr = ''
    for word in sentence_depart:
        if word not in stopwords and word != '\t':
            outstr += word + ' '
    return outstr
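Before running it over the whole corpus, a quick smoke test (illustrative only: the tokens depend on jieba's dictionary and on which words the stop-word file contains):

print(seg_depart('我爱北京天安门'))   # e.g. "爱 北京 天安门 " if 我 is a stop word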
# Input and output paths
filename = "/home/ricardo/out/1.txt"
outfilename = "/home/ricardo/outout.txt"
inputs = open(filename, 'r', encoding='UTF-8')
outputs = open(outfilename, 'w', encoding='UTF-8')
# Segment each line and write the result to the output file
print("Segmenting...")
for line in inputs:
    line_seg = seg_depart(line)
    outputs.write(line_seg + '\n')
outputs.close()
inputs.close()
from gensim.models import word2vec
import multiprocessing

# Train a Word2Vec model; gensim >= 4.0 names the dimension parameter
# vector_size (it was size in gensim 3.x)
def train_wordVectors(sentences, embedding_size=128, window=5, min_count=5):
    w2vModel = word2vec.Word2Vec(sentences, vector_size=embedding_size, window=window,
                                 min_count=min_count, workers=multiprocessing.cpu_count())
    return w2vModel

def save_wordVectors(w2vModel, word2vec_path):
    w2vModel.save(word2vec_path)

def load_wordVectors(word2vec_path):
    return word2vec.Word2Vec.load(word2vec_path)
if __name__ == '__main__':
    # With a single input file, read it with LineSentence
    sentences = word2vec.LineSentence('/home/ricardo/out.txt')
    # With multiple files, read the directory with PathLineSentences:
    # segment_dir = '/words/'
    # sentences = word2vec.PathLineSentences(segment_dir)

    # For ordinary training, setting these few parameters is enough:
    word2vec_path = '/home/ricardo/word2Vec.model'
    model2 = train_wordVectors(sentences, embedding_size=128, window=5, min_count=5)
    save_wordVectors(model2, word2vec_path)
    model2 = load_wordVectors(word2vec_path)
    print(model2.wv.similarity('你好', '您好'))
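Other common queries work the same way once the model is loaded, e.g. nearest neighbours and raw vectors (a sketch; the actual neighbours depend on the training corpus):

    # hypothetical follow-up queries on the trained model
    print(model2.wv.most_similar('你好', topn=5))   # nearest words by cosine similarity
    print(model2.wv['你好'].shape)                  # (128,) with embedding_size=128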