174
1
# -*- coding: utf-8 -*-
2
"""
3
城市回归分析
4
"""
5
import os
6
import numpy as np
7
import pandas as pd
8
import matplotlib.pyplot as plt
9
​
10
import pyecharts.charts as pyc
11
import pyecharts.options as opts
12
import pyecharts.globals as glbs
13
​
14
from sklearn import preprocessing as ppcs
15
from sklearn import linear_model
16
from sklearn import metrics
17
from sklearn import svm
18
from sklearn import model_selection
19
​
20
from city_grouping import render, write_js
21
from brands_clustering import shops, plot_features, features_scaling, get_feat_dict
22
​
23
# 增加城市宏观经济指标特征
24
economy = pd.read_csv('economy.csv')
25
​
26
# 获取特征
27
cities = shops[['city', 'avgprice', 'avgscore', 'comments', 'latitude', 'longitude']]
28
cities = cities.groupby('city').mean()
29
cities['GDP'] = cities.index.map(dict(zip(economy.city, economy['GDP(2022-Q1-Q2)'])))
30
cities['Pop'] = cities.index.map(dict(zip(economy.city, economy['population(2019)'])))
31
cities['counts'] = cities.index.map(shops.groupby('city')['id'].count())
32
cities = cities.sort_values(by='counts', ascending=False)
33
​
34
def plot_corr_matrix(features=cities, rename=True):
35
    '''绘制特征相关矩阵'''
36
    df = features.copy()
37
    if rename:
38
        df.columns = ['人均消费', '评分均值', '评论均值', '经度中心', '纬度中心', 
39
            '生产总值', '人口数量', '店铺数量']
40
    cm = df.corr()
41
    value = [[i, j, round(cm[x][y], 4)] for i, x in enumerate(cm.index) for j, y in enumerate(cm.columns)]
42
    heatmap = pyc.HeatMap(
43
            init_opts=opts.InitOpts(theme=glbs.ThemeType.DARK, width="360px", height="360px", bg_color='#1a1c1d')
44
        ).add_xaxis(list(cm.index)
45
        ).add_yaxis('相关系数', list(cm.columns), value
46
        ).set_series_opts(
47
            label_opts=opts.LabelOpts(is_show=False)
48
        ).set_global_opts(
49
            legend_opts=opts.LegendOpts(is_show=False), 
50
            xaxis_opts=opts.AxisOpts(axislabel_opts=opts.LabelOpts(rotate=90)), 
51
            visualmap_opts=opts.VisualMapOpts(
52
                min_=-1, max_=1, precision=2, 
53
                range_color=["#0000FF", "#FFFFFF", "#B50A24"], 
54
                orient='horizontal', pos_top='1%', pos_left='center', 
55
                item_width='12px', item_height='200px'
56
            )
57
        )
58
    render(heatmap)
59
    grid_option = '"grid": {"x": 80, "y": 50, "x2": 30, "y2": 80},\n    '
60
    write_js('temp.html', '"visualMap": {', grid_option + '"visualMap": {')
61
    try:
62
        os.remove('corr.html')
63
    except:
64
        pass
65
    os.rename('temp.html', 'corr.html')
66
​
67
def plot_predict(model, features, labels, title=''):
68
    '''绘制拟合曲线以及残差'''
69
    pred = model.predict(features)
70
    residual = - (labels - pred)
71
    res_up = residual.map(lambda x: x if x >= 0 else None)
72
    res_dw = residual.map(lambda x: x if x < 0 else None)
73
    init_options = opts.InitOpts(theme=glbs.ThemeType.DARK, bg_color='#1a1c1d', width='100%', height='360px')
74
    line = pyc.Line(init_opts=init_options
75
        ).add_xaxis(list(labels.index)
76
        ).add_yaxis('truth', [round(l, 2) for l in labels], symbol_size=6
77
        ).add_yaxis(
78
            'prediction', [round(p, 2) for p in pred], 
79
            itemstyle_opts=opts.ItemStyleOpts(color='#A35300')
80
        ).set_series_opts(label_opts=opts.LabelOpts(is_show=False)
81
        ).set_global_opts(title_opts=opts.TitleOpts(title=title))
82
    bar = pyc.Bar(init_opts=init_options
83
        ).add_xaxis(list(labels.index)
84
        ).add_yaxis(
85
            'residual+', [round(r, 2) for r in res_up], bar_width='60%', 
86
            itemstyle_opts=opts.ItemStyleOpts(color='#DA6964')
87
        ).add_yaxis(
88
            'residual-', [round(r, 2) for r in res_dw], bar_width='60%', 
89
            itemstyle_opts=opts.ItemStyleOpts(color='#6F9F71')
90
        ).set_series_opts(label_opts=opts.LabelOpts(is_show=False)
91
        )
92
    line.overlap(bar)
93
    try:
94
        os.remove(f'{title}.html')
95
    except:
96
        pass
97
    line.render(f'{title}.html')
98
​
99
def print_coefficients(model):
100
    text = f'y = {round(model.intercept_, 4)}'
101
    for i, c in enumerate(model.coef_):
102
        text += f' + {round(c, 4)} · X{i+1}'
103
    print('\n'+'-'*60+'\n' + text + '\n'+'-'*60+'\n')
104
​
105
​
106
if __name__ == "__main__":
107
​
108
    # 测试各种回归方法
109
    selected = ['GDP', 'Pop', 'latitude', 'longitude']
110
    LABEL = 'counts'
111
    features = cities[selected]
112
    labels = cities[LABEL]
113
    plot_corr_matrix(cities[selected+[LABEL]], False)
114
    # 线性回归
115
    lr = linear_model.LinearRegression()
116
    lr.fit(features, labels)
117
    plot_predict(lr, features, labels, 'Linear')
118
    print_coefficients(lr)
119
    print(lr.score(features, labels))
120
    # # 岭回归
121
    # ridge = linear_model.Ridge(alpha=1)
122
    # ridge.fit(features, labels)
123
    # plot_predict(ridge, features, labels, 'Ridge')
124
    # print_coefficients(ridge)
125
    # # Lasso
126
    ...
127
    # # SVR
128
    ...
129
    # 基本没有区别,因为在低维小样本的情况下,大多数回归模型都会退化成线性回归
130
​
131
    # 由于线性回归中的残差存在显著的模式,这里尝试进行多项式回归,引入高次项
132
    poly_feat = ppcs.PolynomialFeatures(degree=2, include_bias=False).fit_transform(features)
133
    # 这里有四个变量,交互项 c(4, 2) = 6,加上平方项 4 个、一次项 4 个、变成 14 个特征
134
    # 因为要得出回归方程,确定一下交互项的排序(文档没写...)
135
    df = pd.DataFrame({'x1': [3], 'x2':[5], 'x3':[7], 'x4':[11]})
136
    print(ppcs.PolynomialFeatures(degree=2, include_bias=False).fit_transform(df))
137
    # 应用线性回归
138
    poly_lr = linear_model.LinearRegression()
139
    poly_lr.fit(poly_feat, labels)
140
    plot_predict(poly_lr, poly_feat, labels, LABEL)
141
    print_coefficients(poly_lr)
142
    print(poly_lr.score(poly_feat, labels))
143
    # 交叉验证评分
144
    scores = []
145
    for train, test in model_selection.KFold(5).split(poly_feat):
146
        plr = linear_model.LinearRegression()
147
        plr.fit(poly_feat[train], labels[train])
148
        s = metrics.r2_score(labels[test], plr.predict(poly_feat[test]))
149
        scores.append(s)
150
    plt.plot(scores)
151
    plt.show()
152
    # 明显的低偏差高方差,这里相当于用 5% 的【非随机采样】来预测整体(中国一共六百多个城市)
153
    # 而且可用的特征不多,能不能泛化看天意...
154
    model = {
155
        'label': LABEL, 
156
        'columns': list(features.columns),
157
        'poly-degree': 2,
158
        'intercept': float(poly_lr.intercept_), 
159
        'coefs': [float(c) for c in poly_lr.coef_]
160
    }
161
    with open(LABEL+'.json', 'w', encoding='utf-8') as f:
162
        f.write(str(model).replace(',', ',\n').replace("'", '"'))
163
​
164
    # 由于引入高次项后特征维度骤增,再次加入惩罚项看看效果
165
    # scores = []
166
    # for a in [1e-5, 1e0, 1e1, 1e2, 1e3, 1e4, 1e5]:
167
    #     lasso = linear_model.Lasso(alpha=a, normalize=True, max_iter=int(1e5))
168
    #     lasso.fit(poly_feat, labels)
169
    #     scores.append(lasso.score(poly_feat, labels))
170
    # plt.plot(scores)
171
    # plt.show()
172
    # 分数单调递减,当惩罚系数最小(alpha -> 0)时接近线性模型拟合度
173
    # 结论:特征缩减只会提高偏差,至于方差的变化这里就无从考究了
174
​