
Training Chinese Word Vectors

July 7, 2020

Preprocessing

Convert the .dat file to a .txt file. The Sogou news corpus (news_sohusite_xml.dat) is GB2312-encoded XML-like text in which each article body sits between <content> tags; the script below pulls those bodies out into a plain UTF-8 text file, one article per line.

import codecs
import re

# Path to the GB2312-encoded Sogou news corpus
file_path = '/home/ricardo/out/news_sohusite_xml.dat'
# Read the file, ignoring undecodable bytes
f2 = codecs.open(file_path, encoding='GB2312', errors='ignore')
content2 = f2.read()
f2.close()
# Write the extracted article bodies to a UTF-8 text file
f = codecs.open('/home/ricardo/out/news_sohusite_xml.txt', 'w', encoding='utf8')
# Extract the text between <content> and </content> (non-greedy,
# so multiple articles on one line are not merged)
a = re.findall('<content>(.*?)</content>', content2)
print("Length of list: %d" % len(a))
for i, item in enumerate(a, 1):
    f.write(item + '\n')
    if i % 1000 == 0:
        print("index: %d / %d" % (i, len(a)))

f.close()
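The original script imported chardet without using it, presumably to verify the corpus encoding before committing to GB2312. A minimal sketch of that check, sampling the first chunk of raw bytes (the exact confidence value will vary):

import chardet

# Guess the encoding from a ~100 KB sample of raw bytes
with open(file_path, 'rb') as raw:
    sample = raw.read(100000)
print(chardet.detect(sample))
# e.g. {'encoding': 'GB2312', 'confidence': 0.99, 'language': 'Chinese'}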

Word segmentation and stopword removal

import jieba
jieba.enable_parallel()  # parallel segmentation (not supported on Windows)

# Load the stopword list (here: the HIT stopword list) as a set for fast lookup
def stopwordslist():
    with open('/home/ricardo/stopwords/hit_stopwords.txt', encoding='UTF-8') as fh:
        return set(line.strip() for line in fh)

# Load the stopwords once, not once per sentence
stopwords = stopwordslist()

# Segment one sentence with jieba and drop stopwords
def seg_depart(sentence):
    sentence_depart = jieba.cut(sentence.strip())
    outstr = ''
    for word in sentence_depart:
        if word not in stopwords and word != '\t':
            outstr += word + " "
    return outstr

# Input: the extracted corpus; output: the segmented corpus used for training
filename = "/home/ricardo/out/news_sohusite_xml.txt"
outfilename = "/home/ricardo/out.txt"
inputs = open(filename, 'r', encoding='UTF-8')
outputs = open(outfilename, 'w', encoding='UTF-8')

# Segment each line and write the result to out.txt
for line in inputs:
    line_seg = seg_depart(line)
    outputs.write(line_seg + '\n')
outputs.close()
inputs.close()
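Before running the whole corpus through, it can help to sanity-check seg_depart on a single sentence. The sample sentence below is illustrative; the exact segmentation depends on jieba's dictionary and your stopword list:

# Illustrative check; actual output varies with jieba's dictionary
sample = "自然语言处理是人工智能的一个重要方向"
print(seg_depart(sample))
# e.g. 自然语言 处理 人工智能 重要 方向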

Training

from gensim.models import word2vec
import multiprocessing

def train_wordVectors(sentences, embedding_size=128, window=5, min_count=5):
    # Note: gensim >= 4.0 uses vector_size; in older versions the parameter was size
    w2vModel = word2vec.Word2Vec(sentences, vector_size=embedding_size, window=window,
                                 min_count=min_count, workers=multiprocessing.cpu_count())
    return w2vModel

def save_wordVectors(w2vModel, word2vec_path):
    w2vModel.save(word2vec_path)

def load_wordVectors(word2vec_path):
    w2vModel = word2vec.Word2Vec.load(word2vec_path)
    return w2vModel

if __name__ == '__main__':

    # With a single file, use LineSentence to read it
    sentences = word2vec.LineSentence('/home/ricardo/out.txt')

    # With multiple files, use PathLineSentences on the directory instead:
    # segment_dir = '/words/'
    # sentences = word2vec.PathLineSentences(segment_dir)

    # For typical training, setting these few parameters is enough:
    word2vec_path = '/home/ricardo/word2Vec.model'
    model2 = train_wordVectors(sentences, embedding_size=128, window=5, min_count=5)
    save_wordVectors(model2, word2vec_path)
    model2 = load_wordVectors(word2vec_path)
    print(model2.wv.similarity('你好', '您好'))
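Beyond the similarity between two words, the trained model supports the usual lookups. A short sketch, assuming the query word appeared at least min_count times and is therefore in the vocabulary:

# Look up the 128-dimensional embedding for a word
vector = model2.wv['你好']
print(vector.shape)  # (128,)

# Find the five nearest neighbours in the embedding space
for word, score in model2.wv.most_similar('你好', topn=5):
    print(word, score)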

Original article: https://blog.csdn.net/Ricardo98/article/details/107157024
