
Using Python to Run Sentiment Analysis on Taobao Reviews and Draw a Word Cloud

Introduction

Scrape all user reviews for a single product, clean the review data, and then run sentiment analysis with Senta, Baidu's deep-learning Chinese sentiment analysis tool, so that the positive reviews, negative reviews, and positive-review rate can be shown in a pie chart.

Scraping the Data

Because Selenium is used to drive Google Chrome, chromedriver has to be placed in /usr/local/bin.
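
A quick way to check that Selenium can find the driver (a minimal sketch, assuming Selenium 3.x, which the find_element_by_* calls below imply; the explicit executable_path argument is optional when the driver is already on PATH):

from selenium import webdriver

# When chromedriver is on PATH, webdriver.Chrome() finds it by itself;
# otherwise point Selenium at it explicitly (Selenium 3.x style):
driver = webdriver.Chrome(executable_path='/usr/local/bin/chromedriver')
driver.quit()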

The product I chose is 三只松鼠_香酥小黄鱼96g (Three Squirrels crispy small yellow croaker, 96 g).

(Screenshot: the target product page)

# Scrape the reviews

from selenium import webdriver
from time import sleep
import csv
import random

# chromedriver lives in /usr/local/bin
driver = webdriver.Chrome()
# Product link for 三只松鼠_香酥小黄鱼96g
url = 'https://detail.tmall.com/item.htm?spm=a230r.1.14.6.4c207c75wjrvsJ&id=551835141835&cm_id=140105335569ed55e27b&abbucket=9&skuId=4069566081270'
driver.get(url)
driver.maximize_window()
sleep(2)
driver.execute_script("window.scrollBy(0, 768)", "")  # scroll the page down 768 px
# Sleep a good while before clicking: on a slow connection, clicking before
# the button has loaded raises an exception and kills the program.
sleep(5)
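
# A sturdier alternative to the fixed sleep (an assumption on my part, not
# part of the original run) is an explicit wait for the tab bar:
# from selenium.webdriver.support.ui import WebDriverWait
# from selenium.webdriver.support import expected_conditions as EC
# from selenium.webdriver.common.by import By
# WebDriverWait(driver, 10).until(
#     EC.element_to_be_clickable((By.ID, 'J_TabBar')))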
# Find the <li> that holds the reviews tab and click it
driver.find_element_by_id('J_TabBar').find_elements_by_tag_name('li')[1].click()
sleep(2)

with open('original_data.csv', 'w', encoding='utf-8', newline='') as opf:
    writer = csv.writer(opf)
    title = ['user_id', 'comments']
    writer.writerow(title)
    count = 1
    # Tmall only shows 99 pages of reviews
    for j in range(100):
        # Randomise the sleep time so the crawler is harder to detect
        sleep(2 + random.random())
        for i in range(3):  # scroll down three times so all data finishes loading
            driver.execute_script("window.scrollBy(0, 800)", "")  # scroll down 800 px
            sleep(1 + random.random())
        sleep(random.choice([3, 4, 5]) + random.random())
        parent_div = driver.find_element_by_id('J_Reviews').find_element_by_class_name('rate-grid')
        all_trs = parent_div.find_elements_by_tag_name('tr')
        for tr in all_trs:
            data = []
            data.append("user_%d" % count)
            comments = ' '
            all_comments = tr.find_elements_by_class_name('tm-rate-fulltxt')
            for i, comment in enumerate(all_comments):
                if i == 0:
                    comments = comments + comment.text.strip()
                else:
                    comments = comments + '\n 追加评论: ' + comment.text.strip()
            data.append(comments)
            writer.writerow(data)
            count += 1
        # Click the "next page" link in the paginator
        driver.find_element_by_class_name('rate-paginator').find_elements_by_tag_name('a')[-1].click()
        sleep(random.choice([4, 5, 6]) + random.random())
        driver.delete_all_cookies()

To avoid putting extra load on Taobao's servers, the script sleeps after every action, which also keeps the crawl running smoothly. If a login window pops up during the crawl, close it promptly so the crawler is not interrupted.

Because Tmall only displays 99 pages of reviews, only about 2,000 comments could be scraped for this single product.

(Screenshot: a sample of original_data.csv)

Cleaning the Data

Looking at the data above, it is easy to spot entries that are useless and get in the way of sentiment analysis, such as 追加评论:此用户没有填写评论! ("Follow-up review: this user did not write a review!"). Obviously useless text like this has to be removed.
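
As a quick illustration of what the script below does, here is the same three-step cleanup applied to two made-up review strings (a minimal sketch; the sample strings are hypothetical):

import pandas as pd

# Hypothetical samples: one with a seller reply, one with an empty follow-up
s = pd.Series(['很好吃解释:感谢亲的支持', '不错\n 追加评论: 此用户没有填写评论!'])
s = s.str.split('解释').str[0]               # keep only the buyer's own text
s = s.str.replace('追加评论:', '')            # drop the follow-up marker
s = s.str.replace('此用户没有填写评论!', '')   # drop the empty-review placeholder
print(s.tolist())  # the seller reply and both markers are gone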

# Clean the scraped reviews

import pandas as pd

df = pd.read_csv('original_data.csv', names=['user_id', 'content'], header=0)

# Deduplicate
df = df.drop_duplicates()

# Remove seller replies, which follow the pattern "解释:..."
comment_split = df['content'].str.split('解释').apply(pd.Series)

# Keep only the first column (the buyer's own text)
df['content'] = comment_split[0]

df['content'] = df['content'].str.replace('追加评论:', '')
df['content'] = df['content'].str.replace('此用户没有填写评论!', '')

# Drop rows whose content is missing, so the later scripts never hit NaN
df = df.dropna(subset=['content']).reset_index(drop=True)

# df.info()
df.to_csv('new_data.csv')

# An earlier experiment with SnowNLP, kept for reference:
# from snownlp import SnowNLP
# comments = df.iloc[:, 1]
# comments = comments[comments.apply(len) >= 10]
#
# good_counter = 0        # number of positive reviews
# just_so_so_counter = 0  # neutral reviews
# bad_counter = 0         # number of negative reviews
# sum = 0                 # total score
# cnt = 0
# for item in comments:
#     sentiment = SnowNLP(str(item)).sentiments
#     sum += sentiment
#     if sentiment > 0.8:
#         good_counter += 1
#     elif sentiment > 0.4:
#         just_so_so_counter += 1
#     else:
#         bad_counter += 1
#     print("%d %f %s" % (cnt, sentiment, item))
#     cnt += 1
# print("good: %d, just_so_so: %d, bad: %d, average: %f"
#       % (good_counter, just_so_so_counter, bad_counter, sum / len(comments)))

Sentiment Analysis

At first I could not decide which framework to use, but after comparing SnowNLP with senta_bilstm, senta_bilstm won hands down. It is from Baidu, after all.

Take the sentence below: it is clearly a positive review, yet SnowNLP scores it as negative (the closer the score is to 1, the more positive; the closer to 0, the more negative).

(Screenshot: SnowNLP's score for this review)
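
For reference, scoring a single sentence with SnowNLP takes two lines (a minimal sketch; the sentence is an example, not one of the scraped reviews):

from snownlp import SnowNLP

s = SnowNLP('小黄鱼很香很酥,孩子特别喜欢')
print(s.sentiments)  # a float in [0, 1]; closer to 1 means more positive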

Now look at PaddleHub's senta_bilstm on the same sentence: it classifies it correctly, which is what a proper sentiment analysis should show. Baidu wins hands down!
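
The equivalent spot check with senta_bilstm looks like this (a minimal sketch using the PaddleHub 1.x dict-style API, the same call the full script below relies on):

import paddlehub as hub

senta = hub.Module(name="senta_bilstm")
results = senta.sentiment_classify(data={"text": ['小黄鱼很香很酥,孩子特别喜欢']})
# Each result carries sentiment_key ('positive'/'negative') and positive_probs
print(results[0]['sentiment_key'], results[0]['positive_probs'])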

Now for the real run: sentiment analysis over all the cleaned reviews with senta_bilstm.

(Screenshot: senta_bilstm output over all reviews)

I was particularly curious about the negative reviews, so I printed them out separately as well.

(Screenshot: the negative reviews printed separately)
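
Filtering out the negative reviews is a one-line pass over the results list produced by the script below (a sketch, reusing that script's results variable):

# Keep only the entries senta labelled negative
bad_reviews = [r for r in results if r['sentiment_key'] == 'negative']
for r in bad_reviews:
    print(r['positive_probs'], r['text'])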

import paddlehub as hub
import json
import six
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm


plt.rcParams['font.sans-serif'] = ['Arial Unicode MS']  # font that can render Chinese
plt.rcParams['axes.unicode_minus'] = False  # display minus signs correctly


senta = hub.Module(name="senta_bilstm")

text = []
df = pd.read_csv('new_data.csv')

for i in range(len(df)):
    text.append(df.loc[i, 'content'])

input_dict = {"text": text}

results = senta.sentiment_classify(data=input_dict)

# Attach the original sentence to each result
for index, sentence in enumerate(text):
    results[index]["text"] = sentence

good = 0
bad = 0
for index, result in enumerate(results):
    if six.PY2:
        print(json.dumps(results[index], encoding="utf8", ensure_ascii=False))
    else:
        tag = results[index]['sentiment_key']
        if tag == 'positive':
            print("%4d \033[41m 好评 \033[0m " % index, end='')
            good += 1
        else:
            print("%4d \033[43m 差评 \033[0m " % index, end='')
            bad += 1
        print("\033[42m %f \033[0m %s" % (results[index]['positive_probs'], results[index]['text']))
print("好评: %d, 差评: %d 好评率: %.2f%s" % (good, bad, 100 * good / (good + bad), "%"))

plt.title('好评差评比例饼图', fontsize=24)
label = ['好评', '差评']
explode = [0.01, 0.01]
patches, texts, autotexts = plt.pie([good, bad], explode=explode, labels=label, autopct='%1.1f%%')
proptease = fm.FontProperties()
proptease.set_size(20)
plt.setp(autotexts, fontproperties=proptease)
plt.setp(texts, fontproperties=proptease)
plt.show()

Drawing the Word Cloud

The word cloud is drawn with WordCloud; any picture will do as the mask.

from PIL import Image
import numpy as np
import pandas as pd
import matplotlib as mpl
mpl.use("TkAgg")
import matplotlib.pyplot as plt

from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

# Read the whole text
text = ''
df = pd.read_csv('new_data.csv')

for i in range(len(df)):
    text += df.loc[i, 'content']

# Read the mask / color image (any picture works)
Adele_coloring = np.array(Image.open("head.png"))
stopwords = set(STOPWORDS)
stopwords.add("said")

font = '/Library/Fonts/Arial Unicode.ttf'

wc = WordCloud(font_path=font,
               width=1200,
               height=1200,
               stopwords=stopwords,
               mask=Adele_coloring).generate(text)

# Create the coloring from the image
image_colors = ImageColorGenerator(Adele_coloring)

# Show
plt.axis("off")
plt.imshow(wc.recolor(color_func=image_colors), interpolation="bilinear")
plt.show()
wc.to_file('show_English.png')
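
One caveat: WordCloud's built-in tokenizer splits on whitespace and punctuation, so it does not segment Chinese into words and tends to treat whole phrases as tokens. A common remedy (an assumption on my part, not something the script above does) is to pre-segment the text with jieba before generating:

import jieba

# Continue from the script above: segment the Chinese text into words,
# then hand WordCloud a space-separated string it knows how to split
seg_text = ' '.join(jieba.cut(text))
wc = WordCloud(font_path=font,
               width=1200,
               height=1200,
               mask=Adele_coloring).generate(seg_text)
wc.to_file('show_segmented.png')  # hypothetical output name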

References