Libraries used:
jieba (Chinese word segmentation)
pandas
numpy
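jieba does the heavy lifting of splitting Chinese text into words. A minimal sketch of the call used below (Python 2, like the rest of the code here; the sample sentence is made up for illustration):

# -*- coding: utf-8 -*-
import jieba

# jieba.lcut splits a Chinese sentence into a plain list of words
tokens = jieba.lcut(u"吴京的电影很好看")
print("/".join(tokens).encode("utf-8"))  # prints the segmented words separated by "/"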
# Merge all comments into one string, keeping only Chinese characters
def mergeData(commit_list_all):
    comments = ''
    for c in commit_list_all:
        comments = comments + str(c).strip()
    # keep only CJK characters (the \u4e00-\u9fff range)
    cleaned_comments = ''.join(re.findall(u'[\u4e00-\u9fff]+', comments.decode("utf-8")))
    return cleaned_comments

# Segment the merged text with jieba, drop stopwords, and count word frequencies
def cleanStopWords(cleaned_comments):
    segment = jieba.lcut(cleaned_comments)
    words_df = pd.DataFrame({'segment': segment})
    # quoting=3 (QUOTE_NONE): do not treat quotes specially when reading the stopword list
    stopwords = pd.read_csv("stop_words_zh_UTF-8.txt", index_col=False, quoting=3,
                            sep="\t", names=['stopword'], encoding='utf-8')
    words_df = words_df[~words_df.segment.isin(stopwords.stopword)]
    # count occurrences of each word ("计数" = count)
    words_stat = words_df.groupby(by=['segment'])['segment'].agg({"计数": numpy.size})
    words_stat = words_stat.reset_index().sort_values(by=["计数"], ascending=False)
    return words_stat
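A minimal usage sketch of the two functions above, assuming the imports from the full script in the appendix and a stop_words_zh_UTF-8.txt file next to the script; the sample comments are made up for illustration:

# Hypothetical stand-ins for the scraped Douban comments (UTF-8 byte strings, Python 2)
sample_comments = ["吴京的动作戏很精彩!", "这部电影值得一看。"]
cleaned = mergeData(sample_comments)      # one string containing only the Chinese characters
words_stat = cleanStopWords(cleaned)      # DataFrame of words and counts, sorted descending
print(words_stat.head(10))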
"3","都","47"
"4","吴京","35"
"5","不","32"
"6","电影","32"
"7","中国","28"
"9","动作","27"
"8","人","27"
"10","很","26"
"11","还","22"
"12","看","21"
"13","一个","17"
"14","上","15"
"16","大片","14"
"15","战狼","14"
"17","这部","13"
"18","好莱坞","13"
Appendix: full code for fetching the Douban comments, segmenting them, and saving the keywords to the database
# -*- coding: utf-8 -*-
__author__ = 'licha'

import sys
reload(sys)
sys.setdefaultencoding("utf-8")  # Python 2: let implicit str/unicode conversions use UTF-8

import spider
from bs4 import BeautifulSoup as bs
import re
import jieba            # Chinese word segmentation
import pandas as pd
import numpy            # used for numpy.size when counting words
import database.mysql as db
# Fetch one page of comments for the movie
def getPage(pageNum):
    start = pageNum * 20 - 1
    url = "https://movie.douban.com/subject/26363254/comments?start={0}&limit=20&sort=new_score&status=P".format(start)
    html = spider.gethtml(url)
    return html
#spider.cookie = " "

# Extract the comment texts from one page of HTML
def getSubject(html):
    commit_list = []
    soup = bs(html, "html.parser")
    commit_list_p = soup.select(".comment > p")
    for comment in commit_list_p:
        commit_list.append(comment.get_text())
    return commit_list

# Fetch the first five pages and collect all comments
def getAllData():
    commit_list_all = []
    for i in range(0, 5):
        html = getPage(i)
        commit_list = getSubject(html)
        commit_list_all.extend(commit_list)
    return commit_list_all
# Merge all comments into one string, keeping only Chinese characters
def mergeData(commit_list_all):
    comments = ''
    for c in commit_list_all:
        comments = comments + str(c).strip()
    cleaned_comments = ''.join(re.findall(u'[\u4e00-\u9fff]+', comments.decode("utf-8")))
    return cleaned_comments

# Segment the merged text with jieba, drop stopwords, and count word frequencies
def cleanStopWords(cleaned_comments):
    segment = jieba.lcut(cleaned_comments)
    words_df = pd.DataFrame({'segment': segment})
    # quoting=3 (QUOTE_NONE): do not treat quotes specially when reading the stopword list
    stopwords = pd.read_csv("stop_words_zh_UTF-8.txt", index_col=False, quoting=3,
                            sep="\t", names=['stopword'], encoding='utf-8')
    words_df = words_df[~words_df.segment.isin(stopwords.stopword)]
    # count occurrences of each word ("计数" = count)
    words_stat = words_df.groupby(by=['segment'])['segment'].agg({"计数": numpy.size})
    words_stat = words_stat.reset_index().sort_values(by=["计数"], ascending=False)
    return words_stat
# Draw a word cloud from the top 100 keywords
def showPic(words_stat):
    import matplotlib.pyplot as plt
    # %matplotlib inline
    import matplotlib
    matplotlib.rcParams['figure.figsize'] = (10.0, 5.0)
    from wordcloud import WordCloud  # word cloud package
    # use a font that supports Chinese, a white background, and cap the font size
    wordcloud = WordCloud(font_path="simhei.ttf", background_color="white", max_font_size=80)
    wordcloud = wordcloud.fit_words(words_stat.head(100).itertuples(index=False))
    plt.imshow(wordcloud)  # actually draw the cloud
    plt.axis("off")
    plt.show()

# Save the top 100 keywords and their counts to the doubankeyword table
def saveKeyWord(name, words_stat):
    for key, count in words_stat.head(100).itertuples(index=False):
        print count, key
        # note: plain string formatting, so keywords containing quotes would break the SQL
        sql = "insert into doubankeyword (name,keyword,count) values('{0}','{1}',{2})".format(name, key, count)
        db.execute(sql)
# Scrape the comments, build the keyword statistics, and store them
def main():
    commit_list_all = getAllData()
    comments = mergeData(commit_list_all)
    words_stat = cleanStopWords(comments)
    saveKeyWord("战狼2", words_stat)
    #showPic(words_stat)
    print comments

if __name__ == "__main__":
    print("你好")
    main()
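saveKeyWord assumes a doubankeyword table already exists, but the source does not give its schema. A sketch of one that would accept the INSERT above, run through the same database.mysql helper; the column names come from the INSERT, while the types and charset are my assumptions:

# Hypothetical schema for the doubankeyword table; adjust types and charset as needed
sql = ("create table if not exists doubankeyword ("
       "  id int auto_increment primary key,"
       "  name varchar(64),"
       "  keyword varchar(64),"
       "  count int"
       ") default charset=utf8")
db.execute(sql)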