文本挖掘與NLP筆記——代碼向:分詞

文本挖掘與NLP筆記——代碼向:分詞

分詞:jieba.cut
# Word segmentation with jieba.cut: full mode, precise mode, and search-engine mode.
words = jieba.cut("我來到北京大學(xué)", cut_all=True)
print('全模式:' + '/'.join([w for w in words]))  # full mode: every possible word
words = jieba.cut("我來到北京大學(xué)", cut_all=False)
print('精確模式:' + '/'.join([w for w in words]))  # precise mode (the default)
words = jieba.cut_for_search("小明畢業(yè)于北京大學(xué),后在美國哈佛大學(xué)深造")
print('/'.join([w for w in words]))  # search-engine mode: precise mode, then re-split long words
# Expected output:
# 全模式:我/來到/北京/北京大學(xué)/大學(xué)
# 精確模式:我/來到/北京大學(xué)
請(qǐng)練習(xí)添加自定義詞典
【文本挖掘與NLP筆記——代碼向:分詞】詞性:jieba.posseg
# Part-of-speech tagging with jieba.posseg: each token comes with a POS flag.
import jieba.posseg as pg

for word, flag in pg.cut("你想去學(xué)校填寫學(xué)生寒暑假住校申請(qǐng)表嗎?"):
    print('%s %s' % (word, flag))
# Sample result after stop-word filtering (from the article):
# '你/學(xué)校/填寫/學(xué)生/寒暑假/住校/申請(qǐng)表'
分詞引入停用詞
# Segmentation with a stop-word list: tokens found in `stopwords` are dropped.
import jieba
import pandas as pd
import numpy as np

paths = '中英文停用詞.xlsx'
dfs = pd.read_excel(paths, dtype=str)  # full Chinese/English stop-word table (used by later cells)
stopwords = ['想', '去', '嗎', '?']  # small inline list for this demo
words = jieba.cut("你想去學(xué)校填寫學(xué)生寒暑假住校申請(qǐng)表嗎?")
'/'.join([w for w in words if (w not in stopwords)])  # '/' joins the kept tokens (it is a separator, not a newline)
# Expected value:
# '你/學(xué)校/填寫/學(xué)生/寒暑假/住校/申請(qǐng)表'
txt轉(zhuǎn)dataframe函數(shù)
import random
import jieba.posseg as pg
import pandas as pd
import numpy as np


def generatorInfo(file_name):
    """Read a labeled corpus and return [label, segmented-text] rows for 1000 random lines.

    Each input line has the form '<label><whitespace><content>'. The content is
    segmented with jieba.posseg; stop words, plain spaces and single-character
    tokens are dropped.

    NOTE(review): relies on the module-level DataFrame `dfs` (loaded from
    中英文停用詞.xlsx in an earlier cell) having a 'stopwords' column — confirm
    against that file.
    """
    with open(file_name, encoding='utf-8') as file:
        line_list = [k.strip() for k in file.readlines()]
    # Fixed: the scraped source had a junk URL injected here ("data = https://...[]").
    data = []
    for k in random.sample(line_list, 1000):
        t = k.split(maxsplit=1)  # t[0] = label, t[1] = raw text
        # Fixed: `w not in dfs['stopwords']` tested the Series *index* (row numbers),
        # so no stop word was ever filtered; `.values` tests the actual words.
        data.append([
            t[0],
            ' '.join([w for w, flag in pg.cut(t[1])
                      if (w not in dfs['stopwords'].values) and (w != ' ') and (len(w) >= 2)])
        ])
    return data


file_name = 'cnews.train.txt'
df = pd.DataFrame(np.array(generatorInfo(file_name)), columns=['類別', '分詞'])
path = '訓(xùn)練集分詞結(jié)果(隨機(jī)選取1000個(gè)樣本).xlsx'
df.to_excel(path, index=False)
df
文本挖掘與NLP筆記——代碼向:分詞

文章插圖
詞云圖:wordcloud
%pylab inlineimport matplotlib.pyplot as pltfrom wordcloud import WordCloudtext = ' '.join(list(df['分詞']))wcloud = WordCloud(font_path='simsun.ttc', #字體路徑background_color='white', #指定背景顏色max_words=500,#詞云顯示最大詞數(shù)max_font_size=150,#指定最大字號(hào)#mask = mask #背景圖片) wcloud = wcloud.generate(text)#生成詞云plt.imshow(wcloud)plt.axis('off')plt.show()
文本挖掘與NLP筆記——代碼向:分詞

文章插圖
提取關(guān)鍵詞:jieba.analyse.extract_tags
# Keyword extraction with TF-IDF: jieba.analyse.extract_tags returns (word, weight) pairs.
import jieba.analyse
import pandas as pd
import numpy as np

path = '訓(xùn)練集分詞結(jié)果(隨機(jī)選取1000個(gè)樣本).xlsx'
df = pd.read_excel(path, dtype=str)
s = ' '.join(list(df['分詞']))
for w, x in jieba.analyse.extract_tags(s, withWeight=True):
    print('%s %s' % (w, x))  # word, then weight
文本挖掘與NLP筆記——代碼向:分詞

文章插圖
# Exercise: extract keywords per category. The prompt asks for the TextRank
# algorithm — jieba.analyse.textrank has the same signature as extract_tags
# and can be swapped in directly; this sample solution uses extract_tags.
import jieba.analyse
import pandas as pd
import numpy as np

path = '訓(xùn)練集分詞結(jié)果(隨機(jī)選取1000個(gè)樣本).xlsx'
df = pd.read_excel(path, dtype=str)
tag = list(set(list(df['類別'])))  # distinct category labels
for t in tag:
    s = ' '.join(list(df[df['類別'] == t]['分詞']))
    print(t)
    for w, x in jieba.analyse.extract_tags(s, withWeight=True):
        # Fixed: the source printed weight before word ('%s %s' % (x, w)),
        # inconsistent with the identical loop in the previous cell.
        print('%s %s' % (w, x))
文本挖掘與NLP筆記——代碼向:分詞

文章插圖
構(gòu)建詞向量
構(gòu)建詞向量,簡(jiǎn)單的方法有兩種,分別是 TfidfTransformer 和 CountVectorizer。
# CountVectorizer turns the space-separated token strings into a term-frequency matrix.
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd  # fixed: this cell used pd without importing pandas

path = '訓(xùn)練集分詞結(jié)果(隨機(jī)選取1000個(gè)樣本).xlsx'
df = pd.read_excel(path, dtype=str)
corpus = df['分詞']
# vectorizer = CountVectorizer(max_features=5000)  # optional: cap the vocabulary size
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)  # sparse document-term matrix
print(X)

經(jīng)驗(yàn)總結(jié)擴(kuò)展閱讀