Here is a simple function that performs the tasks described above:
import re
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from nltk.stem.porter import PorterStemmer
import emoji
import string

def preprocess_text(text, remove_stop=True, stem_words=False, remove_mentions_hashtags=True):
    """
    eg:
    input: preprocess_text("@water #dream hi hello where are you going be there tomorrow happening happen happens", stem_words=True)
    output: ['tomorrow', 'happen', 'go', 'hello']
    """
    # Remove emojis matched by codepoint range
    emoji_pattern = re.compile("[\U0001F1E0-\U0001F6FF]+", flags=re.UNICODE)
    text = emoji_pattern.sub(r"", text)
    # Remove any remaining emoji characters
    # (note: emoji>=2.0 renamed UNICODE_EMOJI to EMOJI_DATA)
    text = "".join([x for x in text if x not in emoji.UNICODE_EMOJI])
    # Strip @mentions and #hashtags
    if remove_mentions_hashtags:
        text = re.sub(r"@(\w+)", " ", text)
        text = re.sub(r"#(\w+)", " ", text)
    # Drop non-ASCII characters
    text = re.sub(r"[^\x00-\x7F]+", " ", text)
    # Remove punctuation, digits and whitespace control characters
    regex = re.compile('[' + re.escape(string.punctuation) + '0-9\\r\\t\\n]')
    nopunct = regex.sub(" ", text.lower())
    words = nopunct.split()
    if remove_stop:
        words = [w for w in words if w not in ENGLISH_STOP_WORDS]
        words = [w for w in words if len(w) > 2]  # drop very short tokens (a, an, of, hi, ...)
    if stem_words:
        stemmer = PorterStemmer()
        words = [stemmer.stem(w) for w in words]
    return list(words)

Word vectors - what are they?
With this representation, the text 'Tomorrow will be a good day' can be encoded as [0, 1, 1, 0, 1, 1, 0, 1, 0]. Note that the word 'will' is simply ignored because it does not exist in the vocabulary at all. For this model to work well, the vocabulary must be broad and well chosen. Also note how this representation completely ignores relationships between words (order of occurrence, semantic relations).
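To make the encoding concrete, here is a minimal sketch of the binary bag-of-words idea described above. The vocabulary is hypothetical (the original article does not list it); it was chosen so that the example sentence reproduces the vector shown above.

# Hypothetical vocabulary, chosen to reproduce the example vector above
vocab = ["amazing", "be", "day", "dog", "good", "tomorrow", "cat", "a", "run"]

def bag_of_words(text, vocab):
    # 1 if the vocabulary word occurs in the text, 0 otherwise
    tokens = set(text.lower().split())
    return [1 if w in tokens else 0 for w in vocab]

print(bag_of_words("Tomorrow will be a good day", vocab))
# -> [0, 1, 1, 0, 1, 1, 0, 1, 0]; 'will' is dropped because it is not in vocab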
Word2Vec word embeddings:
Here are some helper functions for loading the GloVe file into a dictionary, computing the centroid of a document, and measuring the distance between two centroids:
import numpy as np

# Load the GloVe file into a dictionary mapping word -> vector
def load_glove(filename):
    glove_dict = {}
    with open(filename) as f:
        file_content = f.readlines()
    for line in file_content:
        line_content = line.split()
        glove_dict[line_content[0]] = np.array(line_content[1:], dtype=float)
    return glove_dict

# Get the centroid (mean word vector) of a particular document
def get_centroid(text, gloves):
    words_list = preprocess_text(text)
    word_vec_sum = 0
    words_count = 0
    for w in words_list:
        if w in gloves:
            word_vec_sum += gloves[w]
            words_count += 1
    if words_count:
        return word_vec_sum / words_count
    else:
        return 0

# Get the Euclidean distance between two centroids
def get_distance(a, b):
    return np.linalg.norm(a - b)
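For illustration, here is a usage sketch of these helpers. The GloVe file name and the sample sentences are assumptions, not part of the original; any pretrained GloVe text file (e.g. from https://nlp.stanford.edu/projects/glove/) will work.

# Assumed file name: any pretrained GloVe text file works here
gloves = load_glove("glove.6B.50d.txt")

c1 = get_centroid("Tomorrow will be a good day", gloves)
c2 = get_centroid("I hope the weather is sunny tomorrow", gloves)
c3 = get_centroid("The stock market crashed yesterday", gloves)

# Documents with similar meaning should have closer centroids,
# so the first distance should come out smaller than the second
print(get_distance(c1, c2))
print(get_distance(c1, c3))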
Training word vectors from scratch:

Code:
from collections import Counter
from wordcloud import WordCloud
import matplotlib.pyplot as plt

def make_wc(word_list):
    # Fit the cloud to the 40 most common words in the list
    wordcloud = WordCloud()
    wordcloud.fit_words(dict(Counter(word_list).most_common(40)))
    fig = plt.figure(figsize=(10, 10))
    plt.imshow(wordcloud)
    plt.axis("off")
    plt.show()

make_wc([token.text for token in doc if token.pos_ in ['NOUN']])
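The doc object in the last line is not defined in this snippet; since token.pos_ is spaCy's part-of-speech attribute, it is presumably a spaCy-parsed document. A minimal sketch of how it might be created (the model name is standard; the corpus file is a placeholder):

import spacy

nlp = spacy.load("en_core_web_sm")    # small English model, installed separately
doc = nlp(open("corpus.txt").read())  # placeholder: parse your own corpus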
Output:
[Figure: word cloud of the 40 most frequent nouns]
Article, talk, and page are the most frequently occurring nouns.
Sentiment analysis