# -*- coding: utf-8 -*-
"""
Data exploration & home-page chart rendering
"""
5
import pyecharts
6
import pyecharts.charts as pyc
7
import pyecharts.options as opts
8
import pyecharts.globals as glbs
9
from pyecharts.commons.utils import JsCode
10
11
import os
12
import numpy as np
13
import pandas as pd
14
from sklearn.preprocessing import MinMaxScaler
15
16
# Source tables, read from CSV files via relative paths.
# keywords.csv is the word-frequency list obtained by segmenting deals.title
# with jieba.
deals, shops, keywords = (
    pd.read_csv(path) for path in ('deals.csv', 'shops.csv', 'keywords.csv')
)
19
20
def render(chart, temp=True, show=False, changeHost=False):
    """Render a chart to an HTML file, with a few custom options.

    Args:
        chart: Object exposing ``render(filename)`` and an ``options``
            mapping; the title is read from
            ``chart.options['title'].opts[0]['text']``.
        temp: When true, write to ``temp.html``. Otherwise name the file
            after the chart title, falling back to ``unnamed.html`` when
            no title can be read.
        show: When true, open the rendered file in the default browser.
        changeHost: When true, rewrite the jsDelivr echarts URL prefix in
            the generated HTML to the pyecharts asset host.
    """
    if temp:
        filename = 'temp.html'
    else:
        try:
            filename = chart.options['title'].opts[0]['text'] + '.html'
        except (AttributeError, IndexError, KeyError, TypeError):
            # Only a missing or unusable title falls back to the default
            # name; unrelated errors are no longer swallowed.
            filename = 'unnamed.html'
    chart.render(filename)
    if changeHost:
        js_0 = 'https://cdn.jsdelivr.net/npm/echarts@latest/dist/'
        js_1 = 'https://assets.pyecharts.org/assets/'
        # The HTML contains non-ASCII chart text. Reading and writing it
        # with the platform default encoding (e.g. GBK) can fail or corrupt
        # it, so the encoding is explicit.
        # NOTE(review): assumes pyecharts emits UTF-8 HTML - confirm.
        with open(filename, 'r', encoding='utf-8') as f:
            html = f.read().replace(js_0, js_1)
        with open(filename, 'w', encoding='utf-8') as f:
            f.write(html)
    if show:
        # Local imports keep this block self-contained.
        import webbrowser
        from pathlib import Path

        # Portable replacement for the Windows-only ``start`` shell command;
        # it also keeps the title-derived file name away from a shell.
        webbrowser.open(Path(filename).resolve().as_uri())
39
40
41
# 1. Ranking of brands by number of shops

# Keep only the brands that have more than 10 shops.
counts = shops.title.value_counts()
titles = list(counts.loc[counts > 10].index)
selected = shops.loc[shops.title.isin(titles)]
counts = selected.title.value_counts()
# Bar chart
bar = pyc.Bar(
    init_opts=opts.InitOpts(theme=glbs.ThemeType.DARK)
).add_xaxis(
    list(counts.index)
).add_yaxis(
    '店铺数量', [int(n) for n in counts]    # numpy data types are not supported
).set_global_opts(
    title_opts=opts.TitleOpts(title="各品牌店铺数量"),
    xaxis_opts=opts.AxisOpts(axislabel_opts=opts.LabelOpts(rotate=22)),
    datazoom_opts=opts.DataZoomOpts(range_start=0, range_end=10)
)
# Overlaid pie chart: the nine largest brands plus one aggregated
# '其他' (other) slice holding the remainder.
top_brands = counts.head(9)
labels = list(top_brands.index) + ['其他']
values = [int(n) for n in top_brands] + [int(counts.sum() - top_brands.sum())]
pie = pyc.Pie().add(
    series_name='主要数量分布',
    data_pair=list(zip(labels, values)),
    center=('75%', '35%'),
    radius='25%',
    is_clockwise=False
).set_series_opts(
    tooltip_opts=opts.TooltipOpts(is_show=True, trigger='item'),
    itemstyle_opts=opts.ItemStyleOpts(border_color="#1a1c1d", opacity=0.8)
)
bar.overlap(pie)
# A few remaining tweaks were made directly in the generated HTML because
# it was unclear how to express them here.
render(bar, show=True)
76
77
78
# 2. Ranking of cities by number of shops

# Same structure as the brand chart, drawn with reversed axes.
counts = shops.city.value_counts()
bar = pyc.Bar(
    init_opts=opts.InitOpts(theme=glbs.ThemeType.DARK)
).add_xaxis(
    list(counts.index)
).add_yaxis(
    '店铺数量', [int(n) for n in counts]    # numpy data types are not supported
).set_global_opts(
    title_opts=opts.TitleOpts(title="主要城市店铺数量"),
    datazoom_opts=opts.DataZoomOpts(range_start=0, range_end=30, orient="vertical")
).reversal_axis()
# Overlaid pie chart: the nine largest cities plus one aggregated
# '其他' (other) slice holding the remainder.
top_cities = counts.head(9)
labels = list(top_cities.index) + ['其他']
values = [int(n) for n in top_cities] + [int(counts.sum() - top_cities.sum())]
pie = pyc.Pie().add(
    series_name='主要数量分布',
    data_pair=list(zip(labels, values)),
    center=('80%', '30%'),
    radius='25%',
    is_clockwise=False
).set_series_opts(
    tooltip_opts=opts.TooltipOpts(is_show=True, trigger='item'),
    itemstyle_opts=opts.ItemStyleOpts(border_color="#1a1c1d", opacity=0.8)
)
bar.overlap(pie)
render(bar, show=True)
107
108
109
# 3. City heat index = shop count (50%) + average price (30%) + brand richness (20%)

# This chart is produced entirely here; its HTML was not edited by hand.
# Compute the three indicators per city and join them into one table.
counts = shops.city.value_counts()
hotRate = pd.DataFrame({'city': list(counts.index), 'shopNum': list(counts)})
prices = shops.groupby('city').avgprice.mean()
hotRate = hotRate.merge(prices, on='city')
# Brand richness = number of distinct brands (titles) per city. Naming the
# result explicitly avoids depending on the column names produced by
# value_counts().reset_index(), which differ between pandas versions.
richness = (
    shops.dropna(subset=['city', 'title'])
    .groupby('city').title.nunique()
    .rename('richness')
    .reset_index()
)
hotRate = hotRate.merge(richness, on='city')
# Min-max normalise each indicator, then apply its weight.
weight = [0.5, 0.3, 0.2]    # custom
indicators = ['shopNum', 'avgprice', 'richness']
scaled = pd.DataFrame(
    MinMaxScaler().fit_transform(hotRate[indicators]) * weight,
    columns=indicators
)
hotRate = pd.concat([hotRate[['city']], scaled], axis=1)
# Total score, plus the province-level region of every city.
hotRate['rate'] = hotRate.shopNum + hotRate.avgprice + hotRate.richness
# NOTE: this list is positional - entry i labels row i of hotRate - so it
# must be kept in sync with the data. The assignment raises if the lengths
# differ, but a reordering of the cities would go unnoticed.
provinces = ['上海','广东','浙江','四川','河南','江苏','北京','广东','湖北','广西','山东','安徽','陕西','福建','甘肃','湖南','江西','辽宁','云南','天津','重庆','山西','内蒙古','贵州','吉林','黑龙江','宁夏','青海','海南','河北','新疆','西藏']
hotRate['province'] = provinces
# Plot. Row 7 carries the second '广东' entry; dropping it leaves exactly
# one row per province.
capital = hotRate.drop(7)
# One map series per index. The weighted columns are divided by their
# weight again so that every series is back on the 0-1 scale.
map_series = [
    ('茶饮行业热度指数', capital.rate),
    ('店铺数量指数（50%）', capital.shopNum / weight[0]),
    ('产品价格指数（30%）', capital.avgprice / weight[1]),
    ('品牌丰度指数（20%）', capital.richness / weight[2]),
]
heat_map = pyc.Map(
    init_opts=opts.InitOpts(theme=glbs.ThemeType.DARK, width='100%', height='360px', bg_color='#1a1c1d')
)
for series_name, scores in map_series:
    heat_map.add(
        series_name=series_name,
        maptype='china',
        is_map_symbol_show=False,
        zoom=1.24,
        data_pair=[(province, round(float(score), 2)) for province, score in zip(capital.province, scores)],
        emphasis_itemstyle_opts=opts.ItemStyleOpts(area_color='#4992FF')
    )
heat_map.set_series_opts(
    label_opts=opts.LabelOpts(color='#889099')
).set_global_opts(
    legend_opts=opts.LegendOpts(
        orient='vertical',
        selected_mode='single',
        pos_left='2.5%', pos_bottom='3%',
        item_gap=6, item_width=18, item_height=10
    ),
    visualmap_opts=opts.VisualMapOpts(
        min_=0, max_=1, precision=2,
        range_color=["#78A8F4", "#3F44A8", "#7D0083"],
        range_text=["High", "Low"],
        is_calculable=True,
        orient='horizontal', pos_left='left', pos_top='2%',
        item_width=12, item_height=100
    )
)
render(heat_map, show=True, changeHost=True)
184
185
186
# 4. Price distribution of the major brands

# Brands with more than 10 shops in total.
counts = shops['title'].value_counts()
titles = counts[counts > 10].index.tolist()
191
def get_price_seq(title, df=None):
    """Return the positive average prices of one brand's shops.

    Args:
        title: Brand name, compared for equality with the ``title`` column.
        df: Table with ``title`` and ``avgprice`` columns. Defaults to the
            module-level ``shops`` table, looked up at call time.

    Returns:
        A list of floats: ``avgprice`` of every row of the brand whose
        price is greater than zero (zero and missing prices are excluded).
    """
    if df is None:
        df = shops
    brand_prices = df.loc[df.title == title, 'avgprice']
    return [float(p) for p in brand_prices[brand_prices > 0]]
196
# Collect the price samples of every selected brand; brands with 10 or
# fewer usable prices are skipped.
price_rows = []
for t in titles:
    seq = get_price_seq(t)
    if len(seq) > 10:
        price_rows.append((t, seq, np.median(seq), np.mean(seq)))
data = pd.DataFrame(
    price_rows, columns=['title', 'price_seq', 'median', 'mean']
).sort_values(
    by='median', ascending=False
)
# Box plot of the per-brand price samples, brands ordered by median price.
box = pyc.Boxplot(
    init_opts=opts.InitOpts(theme=glbs.ThemeType.DARK, width='100%', height='360px', bg_color='#1a1c1d')
).add_xaxis(
    xaxis_data=list(data.title)
).add_yaxis(
    series_name='品牌门店均价分布',
    y_axis=pyc.Boxplot.prepare_data(list(data.price_seq)),
    itemstyle_opts=opts.ItemStyleOpts(color='#1a1c1d')
).set_global_opts(
    xaxis_opts=opts.AxisOpts(axislabel_opts=opts.LabelOpts(rotate=22)),
    datazoom_opts=opts.DataZoomOpts(
        range_start=0, range_end=30, orient="horizontal")
)
# Line of per-brand mean prices (truncated to whole numbers), overlaid on
# the box plot.
line = pyc.Line(
    init_opts=opts.InitOpts(theme=glbs.ThemeType.DARK, width='100%', height='360px', bg_color='#1a1c1d')
).add_xaxis(
    list(data.title)
).add_yaxis(
    series_name='平均值',
    y_axis=[int(m) for m in data['mean']],
).set_series_opts(
    label_opts=opts.LabelOpts(is_show=False)
)
box.overlap(line)
render(box, show=True)
228
229
230
# 5. Product keywords

# Normalise the token '百香' to '百香果' (exact-value match, not substring).
keywords['words'] = keywords.words.replace('百香', '百香果')
# Further remove words that are irrelevant for the word cloud.
drop_list = ['中杯', '叠加', '使用', '饮品', '兄弟', '招牌', '份莓莓', '书亦烧', '系列', '建议',
             '单人', '套餐', '满杯', '双人', '大叔', '小姐姐', '网红', '快乐', '经典', '不知',
             '原味', '特饮', '人气', '热饮', '特色', '必点', '双重', '热销']
counts = keywords.loc[~keywords.words.isin(drop_list)].words.value_counts()
# (word, frequency) pairs with plain Python ints.
words = list(zip(counts.index, (int(n) for n in counts)))
wc = pyc.WordCloud(
    init_opts=opts.InitOpts(theme=glbs.ThemeType.DARK, width='100%', height='360px', bg_color='#1a1c1d')
).add(
    '', words, word_size_range=(8, 88), shape=glbs.SymbolType.DIAMOND
)
render(wc, show=True, changeHost=True)
246
247
248
# 6. Brand ranking by sales and score (the sales figures come from an
# inaccurate source and are for reference only)

# Brands with more than 10 shops in total.
counts = shops.title.value_counts()
titles = list(counts.loc[counts > 10].index)
selected = shops.loc[shops.title.isin(titles)]
# Drop missing and zero values. The three needed columns are selected
# first, so a gap in an unrelated column no longer discards a usable row.
sale_score = selected[['title', 'avgscore', 'avgsales']].dropna()
sale_score = sale_score.loc[(sale_score.avgsales > 0) & (sale_score.avgscore > 0)]
# Aggregate per brand and sort by mean score.
sale_score = sale_score.groupby(by='title').mean().sort_values(by='avgscore', ascending=False).reset_index()
# Plot
itemColorJS = '''
    new echarts.graphic.RadialGradient(
        0.4, 0.3, 0.6,
        [{offset: 0, color: 'rgba(111, 152, 232, 1)'},
        {offset: 1, color: 'rgba(36, 54, 76, 1)'}
        ]
    )
'''
# The third data value is log10(sales). Raising 2 to that value re-bases the
# logarithm, which improves the visual spread while keeping the exponential
# trend of the data.
symbolSizeJS = 'function (data) {return 2**data[2]*2;}'
tooltipJS = '''
    function (param) {
        var line1 = '☕ ' + param.data[0] + '<br/>';
        var line2 = '门店评分均值：' + param.data[1] + '<br/>';
        var line3 = '产品销量均值：' + Math.ceil(10**param.data[2]);
        return line1 + line2 + line3;
    }
'''
bubble = pyc.Scatter(
    init_opts=opts.InitOpts(theme=glbs.ThemeType.DARK, width='100%', height='360px', bg_color='#1a1c1d')
).add_xaxis(
    list(sale_score.title)
).add_yaxis(
    '评分均值',
    # Each point is [mean score rounded to 2 decimals, log10 of mean sales].
    [[round(float(score), 2), float(np.log10(sales))]
     for score, sales in zip(sale_score.avgscore, sale_score.avgsales)],
    label_opts=opts.LabelOpts(is_show=False),
    itemstyle_opts=opts.ItemStyleOpts(color=JsCode(itemColorJS)),
    symbol_size=JsCode(symbolSizeJS)
).set_global_opts(
    yaxis_opts=opts.AxisOpts(
        name='评分均值', offset=5, type_="value", is_scale=True, min_interval=0.1,
        splitline_opts=opts.SplitLineOpts(is_show=True, linestyle_opts=opts.LineStyleOpts(type_='dashed'))
    ),
    legend_opts=opts.LegendOpts(is_show=False),
    datazoom_opts=opts.DataZoomOpts(range_start=0, range_end=10, orient='horizontal'),
    xaxis_opts=opts.AxisOpts(axislabel_opts=opts.LabelOpts(rotate=25)),
    tooltip_opts=opts.TooltipOpts(formatter=JsCode(tooltipJS))
)
render(bubble, show=True)