当前位置: 移动技术网 > IT编程>开发语言>.net > cs224n笔记05-探索词向量

cs224n笔记05-探索词向量

2020年07月07日  | 移动技术网IT编程  | 我要评论

1. 加载词向量

def load_word2vec(embeddings_fp="./GoogleNews-vectors-negative300.bin"):
    """ Load Word2Vec Vectors
        Param:
            embeddings_fp (string) - path to .bin file of pretrained word vectors
                (the file has to be downloaded separately beforehand)
        Return:
            wv_from_bin: the loaded embeddings; for the default GoogleNews file
                this is 3 million vectors, each of length 300.
                This is the KeyedVectors format: https://radimrehurek.com/gensim/models/deprecated/keyedvectors.html
    """
    print("Loading 3 million word vectors from file...")
    wv_from_bin = KeyedVectors.load_word2vec_format(embeddings_fp, binary=True)
    # gensim >= 4.0 removed ``vocab`` in favour of ``key_to_index``; keep the
    # old attribute as a fallback so the function works on gensim 3.x as well.
    vocab = getattr(wv_from_bin, "key_to_index", None)
    if vocab is None:
        vocab = wv_from_bin.vocab
    # len() on the mapping avoids materialising a multi-million element list.
    print("Loaded vocab size %i" % len(vocab))
    return wv_from_bin
# Load the pretrained vectors once at module level; the analogy query further
# down reuses this same wv_from_bin object.
wv_from_bin = load_word2vec()
print()  # blank line to separate the loader's log output from what follows

2. 降维

def get_matrix_of_vectors(wv_from_bin, required_words=('barrels', 'bpd', 'ecuador', 'energy', 'industry', 'kuwait', 'oil', 'output', 'petroleum', 'venezuela')):
    """ Put the word2vec vectors into a matrix M.

        A random sample of at most 10000 vocabulary words is taken, then every
        word in ``required_words`` that was not already sampled is added.
        Words for which the lookup raises KeyError are skipped silently.

        Param:
            wv_from_bin: KeyedVectors object; the word2vec vectors loaded from file
            required_words: iterable of words that must be included in M whenever
                a vector exists for them
        Return:
            M: numpy matrix shape (num words, vector size) containing the vectors,
                one row per distinct word
            word2Ind: dictionary mapping each word to its row number in M
        Raises:
            ValueError: from ``np.stack`` when no vector at all could be collected
    """
    import random

    # gensim >= 4.0 exposes the vocabulary as ``key_to_index``; gensim 3.x as ``vocab``.
    vocab = getattr(wv_from_bin, "key_to_index", None)
    if vocab is None:
        vocab = wv_from_bin.vocab
    words = list(vocab)
    print("Shuffling words ...")
    random.shuffle(words)
    words = words[:10000]
    print("Putting %i words into word2Ind and matrix M..." % len(words))
    word2Ind = {}
    M = []
    # Sampled words first, required words afterwards (same row order as before).
    for w in words + list(required_words):
        # A required word may already be in the sample (or be listed twice);
        # appending it again would leave duplicate, unreferenced rows in M.
        if w in word2Ind:
            continue
        try:
            vector = wv_from_bin.get_vector(w)
        except KeyError:
            continue
        word2Ind[w] = len(M)  # row index of the vector appended next
        M.append(vector)
    M = np.stack(M)
    print("Done.")
    return M, word2Ind

3. 单词类比测试

# Word-analogy query: "man" is to "king" as "woman" is to ? (expected: "queen").
pprint.pprint(wv_from_bin.most_similar(positive=['woman', 'king'], negative=['man']))

 

 

本文地址:https://blog.csdn.net/z1103757047/article/details/107167967

如对本文有疑问, 点击进行留言回复!!

相关文章:

验证码:
移动技术网