import collections

maxlen = 0    # track the longest sentence length seen so far
word_freqs = collections.Counter()
# word_freqs = {}
# print(word_freqs)
with open('../data/NLP_data/news.txt', 'r+', encoding='utf8') as f:
    for line in f:
        words = line.lower().split(' ')
        if len(words) > maxlen:
            maxlen = len(words)
        for word in words:
            if not (word in stop_words):
                word_freqs[word] += 1  # word frequency count
                # count = word_freqs.get(word, 0)
                # print(count)
                # word_freqs[word] = count + 1
from sklearn.metrics.pairwise import cosine_similarity

# Compare the similarity of the last sentence against all the other sentences
print(cosine_similarity(tfidf[-1], tfidf[:-1], dense_output=False))
Note that sklearn computes TF-IDF with a slightly different formula:
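Concretely, sklearn's TfidfVectorizer (with its default smooth_idf=True) uses idf(t) = ln((1 + n) / (1 + df(t))) + 1 and then L2-normalizes each row. A minimal sketch verifying that formula by hand (the two-sentence docs list is illustrative; get_feature_names_out assumes sklearn >= 1.0):

import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

docs = ['what is the weather like today',
        'what is for dinner tonight']

vec = TfidfVectorizer(smooth_idf=True, norm='l2')
X = vec.fit_transform(docs)

# Reproduce sklearn's idf by hand: ln((1 + n) / (1 + df)) + 1
n = len(docs)
df = np.array([sum(1 for d in docs if t in d.split())
               for t in vec.get_feature_names_out()])
idf_manual = np.log((1 + n) / (1 + df)) + 1
print(np.allclose(idf_manual, vec.idf_))  # True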
Complete code for a manual TF-IDF implementation:
Note: adding 1 to both numerator and denominator is a smoothing step, and normalization is added by taking the square root of the sum of squared scores.
# coding=utf-8
import math
import numpy
corpus = [
    'what is the weather like today',
    'what is for dinner tonight',
    'this is a question worth pondering',
    'it is a beautiful day today'
]

words = []
# Tokenize the corpus
for i in corpus:
    words.append(i.split())
# Word frequency count per sentence
def Counter(word_list):
    wordcount = []
    for i in word_list:
        count = {}
        for j in i:
            if not count.get(j):
                count.update({j: 1})
            elif count.get(j):
                count[j] += 1
        wordcount.append(count)
    return wordcount
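The excerpt omits the tf/idf/tfidf helpers that the loop below calls. A minimal reconstruction consistent with the printed output (only the idf denominator gets the +1 smoothing, which is why a word such as 'is' that occurs in every sentence receives a negative score):

wordcount = Counter(words)

# Term frequency: occurrences of the word divided by the sentence length
def tf(word, word_dict):
    return word_dict.get(word) / sum(word_dict.values())

# Number of sentences containing the word
def count_sentence(word, wordcount):
    return sum(1 for i in wordcount if i.get(word))

# Inverse document frequency, with +1 smoothing in the denominator
def idf(word, wordcount):
    return math.log(len(wordcount) / (count_sentence(word, wordcount) + 1))

def tfidf(word, word_dict, wordcount):
    return tf(word, word_dict) * idf(word, wordcount)

p = 1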
for i in wordcount:
    tf_idfs = 0
    print('part:{}'.format(p))
    p = p + 1
    for j, k in i.items():
        print('word: {} ---- TF-IDF:{}'.format(j, tfidf(j, i, wordcount)))

        # Normalization: accumulate the squared scores
        tf_idfs += (tfidf(j, i, wordcount) ** 2)

    tf_idfs = tf_idfs ** 0.5
    print(tf_idfs)

    for j, k in i.items():
        print('after normalization: word: {} ---- TF-IDF:{}'.format(j, tfidf(j, i, wordcount) / tf_idfs))

    # break
'''
part:1
word: what ---- TF-IDF:0.04794701207529681
word: is ---- TF-IDF:-0.03719059188570162
word: the ---- TF-IDF:0.11552453009332421
word: weather ---- TF-IDF:0.11552453009332421
word: like ---- TF-IDF:0.11552453009332421
word: today ---- TF-IDF:0.04794701207529681
part:2
word: what ---- TF-IDF:0.05753641449035617
word: is ---- TF-IDF:-0.044628710262841945
word: for ---- TF-IDF:0.13862943611198905
word: dinner ---- TF-IDF:0.13862943611198905
word: tonight ---- TF-IDF:0.13862943611198905
part:3
word: this ---- TF-IDF:0.11552453009332421
word: is ---- TF-IDF:-0.03719059188570162
word: a ---- TF-IDF:0.04794701207529681
word: question ---- TF-IDF:0.11552453009332421
word: worth ---- TF-IDF:0.11552453009332421
word: pondering ---- TF-IDF:0.11552453009332421
part:4
word: it ---- TF-IDF:0.11552453009332421
word: is ---- TF-IDF:-0.03719059188570162
word: a ---- TF-IDF:0.04794701207529681
word: beautiful ---- TF-IDF:0.11552453009332421
word: day ---- TF-IDF:0.11552453009332421
word: today ---- TF-IDF:0.04794701207529681
'''
import gzip

with gzip.open(data_file, 'rb') as f:
    for i, line in enumerate(f):
        print(line)
        break
# Read the OpinRank corpus and apply preprocessing
def read_input(input_file):
    with gzip.open(input_file, 'rb') as f:
        for i, line in enumerate(f):
            # Preprocessing
            yield gensim.utils.simple_preprocess(line)
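A minimal sketch of feeding this generator into Word2Vec training. The hyperparameter values and the 'hotel' query are illustrative; the parameter names follow gensim 3.x (size, rather than the vector_size of gensim 4.x):

import gensim

documents = list(read_input(data_file))  # materialize the generator

# Train a Word2Vec model on the preprocessed corpus
model = gensim.models.Word2Vec(
    documents,
    size=150,      # dimensionality of the word vectors
    window=10,     # context window size
    min_count=2,   # ignore words that occur fewer than 2 times
    workers=4)     # parallel training threads

# Example query: words most similar to 'hotel'
print(model.wv.most_similar('hotel', topn=5))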
# Keyword Extraction
# https://radimrehurek.com/gensim_3.8.3/summarization/keywords.html
# from gensim.summarization import keywords
# # Test corpus
# text = '''Challenges in natural language processing frequently involve
# speech recognition, natural language understanding, natural language
# generation (frequently from formal, machine-readable logical forms),
# connecting language and machine perception, dialog systems, or some
# combination thereof.'''
# Keyword extraction
# print(''.join(keywords(text)))
(6) Document-to-vector model: Doc2vec
The Doc2vec model was inspired by Word2Vec. Just as the word vectors predicted by Word2Vec carry word-level semantics, Doc2vec builds the same kind of structure, which lets it overcome the bag-of-words model's lack of semantics. Suppose we have a training set where each sentence is one sample; like Word2Vec, Doc2vec offers two training schemes: the Distributed Memory Model of Paragraph Vectors (PV-DM), analogous to the CBOW model in Word2Vec, and the Distributed Bag of Words version of Paragraph Vector (PV-DBOW), analogous to the Skip-gram model in Word2Vec.
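In gensim's Doc2Vec these two schemes are selected with the dm parameter (dm=1 for PV-DM, dm=0 for PV-DBOW). A minimal sketch with illustrative hyperparameters and a toy two-document corpus:

from gensim.models.doc2vec import Doc2Vec, TaggedDocument

docs = [TaggedDocument(words=['what', 'is', 'the', 'weather', 'like', 'today'], tags=[0]),
        TaggedDocument(words=['it', 'is', 'a', 'beautiful', 'day', 'today'], tags=[1])]

# PV-DM (analogous to CBOW): dm=1
model_dm = Doc2Vec(docs, dm=1, vector_size=50, min_count=1, epochs=20)

# PV-DBOW (analogous to Skip-gram): dm=0
model_dbow = Doc2Vec(docs, dm=0, vector_size=50, min_count=1, epochs=20)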
# coding=utf-8
import numpy as np
import nltk
import gensim
from gensim.models import word2vec
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sklearn.metrics.pairwise import cosine_similarity
f = open('../data/FAQ/starbucks_faq.txt', 'r', encoding='utf8')
corpus = f.readlines()
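The excerpt omits the tokenization and training steps that the test code below depends on (tokenize, stopword_list, document_tokens, model_d2v). A minimal reconstruction under those assumptions, using nltk's English stopwords and illustrative training hyperparameters:

from nltk.corpus import stopwords
# nltk.download('stopwords'); nltk.download('punkt')  # run once if missing
stopword_list = set(stopwords.words('english'))

# Tokenize a sentence, dropping stopwords and non-alphabetic tokens
def tokenize(text, stopword_list):
    tokens = nltk.word_tokenize(text.lower())
    return [t for t in tokens if t.isalpha() and t not in stopword_list]

document_tokens = [tokenize(line, stopword_list) for line in corpus]

# Wrap each FAQ entry as a TaggedDocument and train a Doc2Vec model
tagged_corpus = [TaggedDocument(words=tokens, tags=[i])
                 for i, tokens in enumerate(document_tokens)]
model_d2v = Doc2Vec(tagged_corpus, vector_size=50, min_count=1, epochs=40)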
# Test: infer a vector for every FAQ entry
questions = []
for i in range(len(document_tokens)):
    questions.append(model_d2v.infer_vector(document_tokens[i]))
questions = np.array(questions)
# print(questions.shape)
# Test sentence
# text = 'find allergen information'
# text = 'mobile pay'
text = 'verification code'
filtered_tokens = tokenize(text, stopword_list)
# print(filtered_tokens)
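To complete the lookup, one would infer a vector for the query and rank the FAQ entries by cosine similarity; a minimal sketch (the top-3 cutoff is illustrative):

# Infer a vector for the query and compare against all FAQ vectors
query_vec = model_d2v.infer_vector(filtered_tokens).reshape(1, -1)
sims = cosine_similarity(query_vec, questions)[0]

# Print the three most similar FAQ entries
for idx in np.argsort(sims)[::-1][:3]:
    print('{:.4f}  {}'.format(sims[idx], corpus[idx].strip()))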