238
1
# -*- coding: utf-8 -*-
2
"""
Brand clustering analysis (品牌聚类分析).
"""
5
import os
6
import time
7
import numpy as np
8
import pandas as pd
9
import matplotlib.pyplot as plt
10
11
import pyecharts.charts as pyc
12
import pyecharts.options as opts
13
import pyecharts.globals as glbs
14
from city_grouping import render, write_js
15
16
from sklearn import preprocessing as ppcs
17
from sklearn.cluster import KMeans
18
from sklearn.metrics import silhouette_score
19
20
# custom package from <https://github.com/paradiseeee/Heaven>
21
from Heaven import mpl_tools
22
# NOTE(review): presumably applies the Heaven package's matplotlib presets
# before any figure is drawn -- confirm against Heaven.mpl_tools.
mpl_tools.Begin().setting()
23
24
# 可视化数据分布
25
def plot_features(df, title='', size=2):
    """Scatter every column of ``df`` against its index on one figure.

    Args:
        df: DataFrame whose columns are each drawn as one scatter series,
            with the frame's index on the x-axis.
        title: Figure title.
        size: Marker size passed to ``plt.scatter`` as ``s``.

    The figure is displayed with ``plt.show()``; nothing is returned.
    """
    plt.figure(figsize=(10, 5))
    marker_style = dict(s=size, alpha=0.6)
    x_positions = df.index
    for column in df.columns:
        plt.scatter(x_positions, df[column], label=column, **marker_style)
    # markerscale enlarges the legend handles, which are otherwise drawn at
    # the (small) marker size used for the points.
    plt.legend(loc='upper right', markerscale=6)
    plt.title(title)
    plt.show()
32
33
# 提取特征
34
# Load the shop table and collapse it to one row per brand ('title').
shops = pd.read_csv('shops.csv')
brands = (
    shops[['title', 'avgprice', 'avgscore']]
    .groupby('title')
    .mean()
    .reset_index()
)
# Keep the per-brand means at two decimals.
brands['avgprice'] = brands['avgprice'].map(lambda value: round(value, 2))
brands['avgscore'] = brands['avgscore'].map(lambda value: round(value, 2))
# Number of shop rows per brand, looked up by title.
brands['counts'] = brands['title'].map(dict(shops['title'].value_counts()))

# Numeric feature matrix used for scaling and clustering.
features = brands.drop(columns='title')
plot_features(features, 'Features before scaling')
43
44
# 特征缩放
45
def features_scaling(features, scaler, return_=False, show=False, method=''):
    """Fit ``scaler`` on ``features`` and build the scaled data as a DataFrame.

    Args:
        features: DataFrame of feature columns.
        scaler: Object exposing ``fit(X)`` and ``transform(X)``; it is fitted
            in place on ``features``.
        return_: When true, return ``(scaler, features_scaled)``; otherwise
            the function returns ``None``.
        show: When true, plot the scaled features with ``plot_features``.
        method: Label appended to the plot title ("Features after <method>").

    Returns:
        ``(scaler, features_scaled)`` if ``return_`` is true, else ``None``.
        ``features_scaled`` keeps the column names and the index of
        ``features`` so rows stay aligned with the source frame.
    """
    scaler.fit(features)
    features_scaled = pd.DataFrame(
        scaler.transform(features),
        columns=features.columns,
        # Carry the index over; without it the result silently falls back to
        # a fresh RangeIndex and loses alignment with ``features``.
        index=features.index,
    )
    if show:
        plot_features(features_scaled, title='Features after ' + method)
    if return_:
        return (scaler, features_scaled)
    return None
52
53
# 各种特征缩放组合
54
def get_feat_dict(norm='l2'):
    """Build every scaling variant of the module-level ``features`` frame.

    Args:
        norm: Norm passed to ``ppcs.Normalizer`` for the normalising steps.

    Returns:
        dict mapping a method name to the scaled DataFrame. Compound names
        list the steps in the order they are applied, e.g.
        ``'minmax_standard'`` is min-max scaling followed by standardisation.
    """
    def scaled_by(data, scaler):
        # Only the transformed frame is needed; the fitted scaler is dropped.
        return features_scaling(data, scaler, return_=True)[1]

    minmax = scaled_by(features, ppcs.MinMaxScaler())
    standard = scaled_by(features, ppcs.StandardScaler())
    normalized = scaled_by(features, ppcs.Normalizer(norm=norm))
    minmax_standard = scaled_by(minmax, ppcs.StandardScaler())
    standard_normalized = scaled_by(standard, ppcs.Normalizer(norm=norm))
    minmax_standard_normalized = scaled_by(minmax_standard, ppcs.Normalizer(norm=norm))

    return {
        'minmax': minmax,
        'standard': standard,
        'normalize': normalized,
        'minmax_standard': minmax_standard,
        'standard_normalize': standard_normalized,
        'minmax_standard_normalize': minmax_standard_normalized,
    }
68
69
# 聚类-单个模型
70
def clustering(features, N, show=False, random_state=None):
    """Fit a single KMeans model on ``features``.

    Args:
        features: DataFrame of (scaled) feature columns. It is not modified.
        N: Number of clusters.
        show: When true, plot the rows of each cluster with ``plot_features``.
        random_state: Forwarded to ``KMeans``. The default ``None`` keeps the
            previous unseeded behaviour; pass an int for reproducible labels.

    Returns:
        The fitted ``KMeans`` instance.
    """
    km = KMeans(n_clusters=N, random_state=random_state)
    km.fit(features)
    if show:
        # Select rows with a boolean mask instead of writing a temporary
        # 'labels' column into the caller's frame: the input stays untouched
        # even if plotting raises, and the label is no longer drawn as if it
        # were a feature.
        for i in np.unique(km.labels_):
            plot_features(features.loc[km.labels_ == i], f'class-{i}')
    return km
79
80
# 获取各类的数据观测
81
def get_clusters(model, feat=None):
    """Split ``feat`` into one DataFrame per cluster of ``model``.

    Args:
        model: Fitted clustering model exposing ``labels_`` (one label per
            row of ``feat``, in row order) and ``n_clusters``.
        feat: DataFrame to split. Defaults to the module-level ``brands``
            frame, looked up when the function is called.

    Returns:
        List of ``n_clusters`` DataFrames; element ``i`` holds the rows
        labelled ``i``. ``feat`` itself is left unmodified.

    Raises:
        ValueError: If the number of labels differs from the number of rows.
    """
    if feat is None:
        # Resolved at call time so the default is not a shared mutable
        # object captured when the function was defined.
        feat = brands
    labels = np.asarray(model.labels_)
    if len(labels) != len(feat):
        raise ValueError(
            f'got {len(labels)} labels for {len(feat)} rows'
        )
    # Boolean masks replace the old add-column/drop-column round trip, which
    # mutated the caller's frame and left a stray 'labels' column behind if
    # anything failed in between.
    return [feat.loc[labels == i].copy() for i in range(model.n_clusters)]
89
90
# 对应原始数据的聚类中心
91
def get_original_centers(model, brands=brands):
    """Compute cluster centres in the units of the unscaled brand data.

    Args:
        model: Fitted clustering model accepted by ``get_clusters``.
        brands: Brand table with a ``'title'`` column plus numeric columns.

    Returns:
        DataFrame with one row per cluster holding the column means of that
        cluster's members (``'title'`` excluded).
    """
    per_cluster_means = []
    for members in get_clusters(model, feat=brands):
        numeric_part = members.drop('title', axis=1)
        per_cluster_means.append(numeric_part.mean())
    return pd.DataFrame(per_cluster_means)
95
96
# 绘制聚类散点图
97
def plot_clusters(km, score, feat, method, save_path=None):
    """Render one clustering result as an interactive 3D scatter page.

    Args:
        km: Fitted KMeans model (``labels_``, ``cluster_centers_``,
            ``n_clusters`` are read).
        score: Silhouette score of ``km``; shown in the subtitle and used in
            the output file name, rounded to 4 decimals.
        feat: Scaled feature frame the model was fitted on; its rows are the
            plotted points.
        method: Name of the scaling method, used in the title and file name.
        save_path: Directory to move the rendered page into. It is created
            if missing. When falsy, the file name is printed and the
            function waits for the user to press Enter instead.
    """
    colors = ['#6D84A5', '#B50A24', '#F49F45', '#D6C7B0', '#73B9BC', '#F0DF7D', '#DA6964', '#6F9F71', '#FFCD41', '#4A393B']
    axis_opts = {
        "type": "value",
        "axisLabel": {"color": "#DDDEEE"},
        "nameTextStyle": {"color": "#DDDEEE"},
        "axisLine": {"lineStyle": {"color": "#889099"}},
        "splitLine": {"lineStyle": {"color": "#889099"}}
    }

    def as_table(frame):
        # Indent each table row by one space; ':' is swapped for '-',
        # presumably so the markdown alignment markers read as a plain rule.
        return frame.to_markdown().replace('\n', '\n ').replace(':', '-')

    clusters = get_clusters(km, feat=feat)
    centers = pd.DataFrame(km.cluster_centers_, columns=['avgprice', 'avgscore', 'counts'])
    centers.index = ['No.' + str(idx+1) for idx in centers.index]
    original_centers = get_original_centers(km)
    original_centers.index = centers.index

    # Text blocks that make up the chart subtitle.
    centers_tb = '\n\n ➤ Cluster Centers: \n\n ' + as_table(centers)
    org_centers_tb = '\n\n ➤ Original Cluster Centers: \n\n ' + as_table(original_centers)
    bar = '\n\n ➤ Clusters Counts: '
    brands_counts = [len(df) for df in clusters]
    for i, c in enumerate(centers.index):
        # One bar glyph per started block of 150 members (always at least one).
        bar += f'\n\n [{c}] ' + '▆' * int(brands_counts[i]/150+1) + ' ' + str(brands_counts[i])

    scatter = pyc.Scatter3D(init_opts=opts.InitOpts(height='700px', width='100%', bg_color='#1a1c1d', theme=glbs.ThemeType.DARK))
    for i in range(km.n_clusters):
        data = [[float(v) for v in rows] for rows in clusters[i].values]
        scatter.add(
            f'Cluster No.{i+1}', data,
            # Cycle through the palette so more clusters than colours does
            # not raise IndexError.
            itemstyle_opts=opts.ItemStyleOpts(color=colors[i % len(colors)]),
            xaxis3d_opts=dict(axis_opts, name='avgprice'),
            yaxis3d_opts=dict(axis_opts, name='avgscore'),
            zaxis3d_opts=dict(axis_opts, name='counts'),
            grid3d_opts=opts.Grid3DOpts(width=80, height=80, depth=80)
        )
    scatter.set_global_opts(
        tooltip_opts=opts.TooltipOpts(is_show=False),
        title_opts=opts.TitleOpts(
            title=f' [ Scaling method: {method} ]',
            subtitle=f' ➤ Best Model: \n\n [ n_clusters = {km.n_clusters}, silhouette_score = {round(score, 4)} ]' + centers_tb + org_centers_tb + bar,
            title_textstyle_opts=opts.TextStyleOpts(color='#DDDEEE'),
            subtitle_textstyle_opts=opts.TextStyleOpts(color='#22DDEE', font_size=14, font_family='Courier New')
        ),
        legend_opts=opts.LegendOpts(pos_right='2%', pos_top='2%', orient='vertical'),
        toolbox_opts=opts.ToolboxOpts(
            pos_left=None, pos_right='2%', pos_bottom='2%', orient='vertical',
            feature=opts.ToolBoxFeatureOpts()
        )
    )
    # NOTE(review): render() is assumed to write 'temp.html' in the working
    # directory, since that is the file patched and moved below -- confirm in
    # city_grouping.
    render(scatter, changeHost=True)
    # Patch the generated page: highlight colour / symbol size for the
    # series, and the axis-pointer colour for the 3D grid.
    write_js('temp.html', '"type": "scatter3D",', '"type": "scatter3D","emphasis": {"itemStyle": {"color": "#00ffff"}},"symbolSize":6,')
    write_js('temp.html', '"grid3D": {', '"grid3D": {"axisPointer": {"lineStyle": {"color": "#22DDEE"}},')
    name = f'{method}-{km.n_clusters}-{round(score, 4)}.html'
    if save_path:
        # Create the target directory on demand and overwrite a previous
        # output of the same name; os.rename fails on a missing directory
        # and, on Windows, on an existing target.
        os.makedirs(save_path, exist_ok=True)
        os.replace('temp.html', os.path.join(save_path, name))
    else:
        print(name)
        input('\n任意键继续...\n')
154
155
# 绘制聚类雷达图
156
def plot_radar(i):
    """Render the radar chart for cluster ``i`` of the selected model.

    Reads the module-level names ``colors``, ``names``, ``quotes`` and
    ``centers``. In this file they are only assigned inside the
    ``if __name__ == "__main__"`` section, so the function is usable only
    after that section has run.

    Args:
        i: Zero-based cluster index into the four globals above.

    The page is rendered and then moved to
    ``[modeling]<cluster name with spaces replaced by '-'>.html``.
    """
    radar = pyc.Radar(
        init_opts=opts.InitOpts(theme=glbs.ThemeType.DARK, width='100%', height='300px', bg_color='#1a1c1d')
    ).set_colors(
        [colors[i]]
    ).add_schema(
        # Three axes, each spanning [-1, 1].
        schema=[{"name": name, "max": 1, "min": -1} for name in ['消费水平', '店铺评分', '店铺数量']],
        shape='circle',
        center=["50%", "50%"],
        radius="80%",
        angleaxis_opts=opts.AngleAxisOpts(
            axistick_opts=opts.AxisTickOpts(is_show=False),
            axislabel_opts=opts.LabelOpts(is_show=False),
            axisline_opts=opts.AxisLineOpts(is_show=False),
            splitline_opts=opts.SplitLineOpts(is_show=False)
        ),
        radiusaxis_opts=opts.RadiusAxisOpts(
            min_=-1, max_=1, interval=0.5,
            splitarea_opts=opts.SplitAreaOpts(
                is_show=True,
                areastyle_opts=opts.AreaStyleOpts(opacity=0.5)
            )
        ),
        polar_opts=opts.PolarOpts(),
        splitarea_opt=opts.SplitAreaOpts(is_show=False),
        splitline_opt=opts.SplitLineOpts(is_show=False)
    ).add(
        series_name=f'{names[i]}：{quotes[i]}',
        data=[{'name': f'{names[i]}', "value": [round(v, 2) for v in centers[i]]}],
        areastyle_opts=opts.AreaStyleOpts(opacity=0.2),
        linestyle_opts=opts.LineStyleOpts(width=1)
    ).set_global_opts(
        legend_opts=opts.LegendOpts(pos_left='2%', pos_bottom='2%', )
    )
    # NOTE(review): render() is assumed to write 'temp.html' in the working
    # directory -- confirm in city_grouping.
    render(radar)
    # os.replace overwrites an output left by a previous run; os.rename
    # raises on Windows when the target already exists.
    os.replace('temp.html', f"[modeling]{names[i].replace(' ', '-')}.html")
191
192
193
if __name__ == "__main__":
    import json

    # NOTE: the statements below deliberately stay at module level rather
    # than inside a main() function: plot_radar() reads `centers`, `names`,
    # `colors` and `quotes` as module globals.

    # Cluster every scaled feature set for each candidate cluster count and
    # score it; models and scores are stored in dicts with the same layout
    # (method name -> list ordered like RANGE).
    start = time.time()
    feat_dict = get_feat_dict()
    score_dict = {k: [] for k in feat_dict}
    model_dict = {k: [] for k in feat_dict}
    RANGE = [2, 3, 4, 5, 6]

    for method, feat in feat_dict.items():
        print('-'*60)
        for n in RANGE:
            print(f'processing {method}-scaled feature with {n} ...')
            km = clustering(feat, n)
            score = silhouette_score(feat, km.labels_)
            score_dict[method].append(score)
            model_dict[method].append(km)

    print(f'------ Total cost {int(time.time()-start)} seconds ------')
    # Index the scores by cluster count so the x-axis shows n_clusters
    # instead of the row positions 0..len(RANGE)-1.
    pd.DataFrame(score_dict, index=RANGE).plot()
    plt.show()

    # Draw the 3D scatter page for every model. The original author's note
    # says to empty the save_path directory before running this step.
    for method, models in model_dict.items():
        feat = feat_dict[method]
        for km, score in zip(models, score_dict[method]):
            plot_clusters(km, score, feat, method, save_path='./modeling')

    # Model chosen after inspecting the output of the steps above:
    # 'standard_normalize' scaling with 3 clusters.
    best = model_dict['standard_normalize'][RANGE.index(3)]
    clusters = get_clusters(best)

    # Dump the 20 brands with the most shops of each cluster as JSON.
    for n, df in enumerate(clusters):
        top = df.reset_index(drop=True).sort_values(by='counts', ascending=False).head(20)
        records = top.to_dict(orient='records')
        # Serialise with json so quotes inside titles are escaped properly.
        # The layout (one object per line inside a single array) matches the
        # previous hand-built output.
        payload = '[' + ', '.join(
            '\n' + json.dumps(record, ensure_ascii=False) for record in records
        ) + ']'
        with open(f'cluster-{n+1}.json', 'w', encoding='utf-8') as f:
            f.write(payload)

    # Radar chart per cluster of the chosen model. The three lists below are
    # hand-written for the 3-cluster model selected above.
    centers = best.cluster_centers_
    names = ['Cluster No.1', 'Cluster No.2', 'Cluster No.3']
    colors = ['#6D84A5', '#B50A24', '#F49F45']
    quotes = ['特征均衡', '低评分', '低消费']
    for i in range(best.n_clusters):
        plot_radar(i)