共现矩阵的一种实现方法，比较笨重，时间复杂度较高

xiaoxiao2022-07-06 196

共现矩阵

共现矩阵统计每个词与其窗口内邻近词共同出现的次数，从而为“以某一个词为中心预测周围词”提供一定的概率分布依据。

"""Build a word co-occurrence matrix from the Reuters corpus and plot 2-D embeddings.

The counting is done with plain Python loops over every (center, context)
position pair, so it is simple but slow on large corpora.
"""

import numpy as np

# Boundary markers wrapped around every document by ``read_corpus``.
START_TOKEN = 'START'
END_TOKEN = 'END'


def read_corpus(category='crude'):
    """Read all Reuters documents of one category as lists of tokens.

    :param category: Reuters category name passed to ``reuters.fileids``.
    :return: one list per document: ``START_TOKEN``, the lower-cased words
        of the document, then ``END_TOKEN``.
    """
    # Imported lazily so the matrix helpers below stay usable without NLTK.
    from nltk.corpus import reuters

    files = reuters.fileids(category)
    return [[START_TOKEN] + [w.lower() for w in reuters.words(f)] + [END_TOKEN]
            for f in files]


def distinct_words(corpus):
    """Collect the vocabulary of a corpus.

    :param corpus: iterable of documents, each a list of word strings.
    :return: tuple ``(vocabulary, size, flat_words)`` where ``vocabulary`` is
        the sorted list of distinct words, ``size`` is its length and
        ``flat_words`` is every word of every document concatenated in order.
    """
    flat_words = [word for document in corpus for word in document]
    vocabulary = sorted(set(flat_words))
    return vocabulary, len(vocabulary), flat_words


def compute_co_occurrence_matrix(corpus, window_size=4):
    """Count how often each pair of words appears within a window.

    For ``i like nlp`` with ``window_size=1`` and center word ``like``, the
    neighbours are ``i`` and ``nlp``.

    Windows never extend past the start or end of a document, so words from
    different documents are not counted together. The diagonal is zeroed, so
    a word is never counted as co-occurring with itself.

    :param corpus: iterable of documents, each a list of word strings.
    :param window_size: number of positions inspected on each side of the
        center word.
    :return: tuple ``(m, word2int)`` where ``m`` is a square float array of
        counts and ``word2int`` maps each word to its row/column index
        (indices follow the sorted vocabulary).
    """
    vocabulary, num_words, _ = distinct_words(corpus)
    word2int = {word: index for index, word in enumerate(vocabulary)}
    m = np.zeros((num_words, num_words))

    for document in corpus:
        ids = [word2int[word] for word in document]
        doc_len = len(ids)
        for center_pos, center_id in enumerate(ids):
            # Clamp the window to the document; a negative index would
            # otherwise silently wrap around to the end of the list.
            first = max(0, center_pos - window_size)
            last = min(doc_len, center_pos + window_size + 1)
            for context_pos in range(first, last):
                m[center_id, ids[context_pos]] += 1

    # Clear the whole diagonal (the center position itself and repeated words).
    np.fill_diagonal(m, 0)
    return m, word2int


def reduce_to_k_dim(m, k=2):
    """Project the rows of ``m`` onto ``k`` dimensions with truncated SVD.

    :param m: 2-D co-occurrence matrix.
    :param k: number of output components.
    :return: array with one ``k``-dimensional row per row of ``m``.
    """
    from sklearn.decomposition import TruncatedSVD

    n_iters = 10
    svd = TruncatedSVD(n_components=k, n_iter=n_iters)
    return svd.fit_transform(m)


def plot_embeddings(x, word2int, words):
    """Draw the selected words at their min-max scaled 2-D coordinates.

    The points are added to the current matplotlib figure; the caller decides
    when to show or save it.

    :param x: 2-D array whose first two columns are the coordinates.
    :param word2int: mapping from word to row index in ``x``.
    :param words: words to draw; words missing from ``word2int`` are skipped.
    """
    import matplotlib.pyplot as plt

    x = np.asarray(x, dtype=float)
    x_min, x_max = np.min(x, 0), np.max(x, 0)
    span = x_max - x_min
    # A constant column would divide by zero; leave such a column at 0.
    span[span == 0] = 1.0
    x = (x - x_min) / span

    wanted = set(words)
    for word, i in word2int.items():
        if word in wanted:
            plt.plot(x[i, 0], x[i, 1], 'x', color='red')
            plt.text(x[i, 0], x[i, 1], word)


def main():
    """Run the full pipeline on the Reuters corpus and show the plot."""
    import matplotlib.pyplot as plt

    reuters_corpus = read_corpus()
    M_co_occurrence, word2Ind_co_occurrence = compute_co_occurrence_matrix(reuters_corpus)
    M_reduced_co_occurrence = reduce_to_k_dim(M_co_occurrence, k=2)

    # Rescale (normalize) the rows to make them each of unit length.
    M_lengths = np.linalg.norm(M_reduced_co_occurrence, axis=1)
    M_lengths[M_lengths == 0] = 1.0  # keep all-zero rows at zero instead of NaN
    M_normalized = M_reduced_co_occurrence / M_lengths[:, np.newaxis]  # broadcasting

    words = ['barrels', 'bpd', 'ecuador', 'energy', 'industry', 'kuwait',
             'oil', 'output', 'petroleum', 'venezuela']
    plot_embeddings(M_normalized, word2Ind_co_occurrence, words)
    plt.show()


if __name__ == '__main__':
    main()

最新回复(0)