# -*- coding: utf-8 -*-
"""
The spider wrote the data out as multiple files for convenience; this script
simply merges them. Since the analysis here is fairly open-ended, it is hard
to say what shape the merged data should take, so any reshaping is deferred
until a concrete task comes up.
"""
import pandas as pd
from meituan_spider import ALLCITIES, KEYWORD, PATH
# Special character to strip from shop titles
S = '•'
# Date the data was collected
collecting_date = '20220803'

def tidy_shops(city='深圳'):
    df = pd.read_csv(f'{PATH}{city}{KEYWORD}_shops.csv')
    # Drop the parenthesized branch name from shop titles: 一点点(XX店) -> 一点点
    df['title'] = df.title.map(lambda s: s.split('(')[0])
    # Replace the special character in shop titles
    df['title'] = df.title.str.replace(S, '·', regex=False)
    # Extract the district from the address into a new field (a rough first
    # pass; some cities may produce dirty values that cannot all be handled)
    df['region'] = df.address.map(lambda s: s.split('区')[0] + '区')
    # Some shops are not primarily tea shops (e.g. 尊宝披萨 in Shenzhen) yet
    # still sell tea drinks, so rather than dropping them, add a boolean
    # field marking whether tea is the shop's main business
    df['isMain'] = df.backCateName.map(lambda cat: KEYWORD in cat)
    return df
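
# A quick sketch of the district extraction above on hypothetical addresses
# (illustrative values, not real rows):
#   '南山区科技园XX路'.split('区')[0] + '区'  ->  '南山区'            (normal case)
#   '前海XX大道8号'.split('区')[0] + '区'    ->  '前海XX大道8号区'    (no '区' in the
#       address, so the whole string survives; such values are cleaned up later)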

def tidy_deals(city='深圳'):
    # The filename pattern is assumed to mirror tidy_shops (ending in
    # _deals.csv); adjust if the spider wrote the files under another name
    df = pd.read_csv(f'{PATH}{city}{KEYWORD}_deals.csv')
    # Missing and duplicated values are left untouched for now, keeping as
    # much of the raw data as possible
    return df

if __name__ == "__main__":

    shops = pd.DataFrame()
    for c in ALLCITIES:
        df = tidy_shops(c)
        df['city'] = c
        # DataFrame.append was removed in pandas 2.0; use pd.concat instead
        shops = pd.concat([shops, df], ignore_index=True)
    shops.to_csv(f'{KEYWORD}-{collecting_date}-shops.csv', index=False)

    deals = pd.DataFrame()
    for c in ALLCITIES:
        df = tidy_deals(c)
        df['city'] = c
        deals = pd.concat([deals, df], ignore_index=True)
    deals.to_csv(f'{KEYWORD}-{collecting_date}-deals.csv', index=False)
    # The result is two nationwide tables: each observation in shops is a
    # shop; each observation in deals is one recommended product


# -*- coding: utf-8 -*-
"""
Further processing of the merged nationwide tables produced by the script above
"""

import pandas as pd

deals = pd.read_csv('奶茶-20220803-deals.csv')
shops = pd.read_csv('奶茶-20220803-shops.csv')

# PART 0: FUNCTIONS

def unify_values(df, col, to_value, *alters):
    # Rewrite each alternative spelling in `col` to the canonical value
    # (plain substring replacement, hence regex=False)
    for alt in alters:
        df[col] = df[col].str.replace(alt, to_value, regex=False)
    return df
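
# Minimal sanity check of unify_values on a throwaway frame (hypothetical
# titles, not real data); it leaves `shops` and `deals` untouched:
_demo = pd.DataFrame({'title': ['coco都可', 'CoCo都可']})
assert unify_values(_demo, 'title', 'CoCo都可', 'coco都可').title.eq('CoCo都可').all()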

# PART I: PROCESSING shops

# filter data
shops = shops.drop_duplicates().reset_index(drop=True)
shops = shops.loc[shops.isMain].drop(['isMain', 'backCateName', 'phone'], axis=1)
# unify spellings among the 100 most frequent titles; unique_top_list is not
# used below and is presumably built for manual inspection when composing
# replace_dict
unique_top_list = list(shops.title.value_counts().head(100).index)
unique_top_list.sort()
replace_dict = {
    '1 點點': ['1點點', '一點點', '1点点', '1 點點', '1 點點 ', '1 點點奶茶'],
    '700CC': ['700CC天然苏打水茶饮', '700cc都市茶饮'],
    'CoCo都可': ['coco都可'],
    'HEY JUICE 茶桔便': ['HEY JUICE茶桔便', 'HEYJUICE茶桔便'],
    '皇茶': ['royaltea皇茶'],
    '丸摩堂': ['丸摩堂100%鲜果茶'],
    '厝内小眷村': ['厝内小眷村cuo nei village'],
    '嘿糖': ['嘿糖鲜奶茶饮'],
    '大卡司': ['大卡司DAKASI'],
    '快乐柠檬': ['快乐柠檬happy lemon', '快乐柠檬happylemon', 'happy lemon快乐柠檬'],
    '蜜雪冰城': ['蜜雪冰城·冰淇淋与茶'],
    '贡茶': ['贡茶GONGCHA'],
}
for k, v in replace_dict.items():
    shops = unify_values(shops, 'title', k, *v)
# drop extreme values
shops = shops.loc[shops.avgprice < 100]
shops = shops.loc[shops.comments >= 0]
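# Region values longer than five characters are treated as failed extractions
# (e.g. the address contained no '区', so tidy_shops kept the whole string);
# mark them as Unknown rather than guessing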
shops['region'] = ['Unknown' if len(r) > 5 else r for r in shops.region]

# PART II: PROCESSING deals

# drop rows whose shop was filtered out of shops above, and keep only the
# columns needed for the merge below
deals = deals.loc[deals.shop_id.isin(shops.id), ['shop_id', 'sales', 'price']]

# PART III: MERGE

# add average sales to shops
sales = deals.groupby('shop_id').sales.mean().reset_index().rename({'shop_id': 'id', 'sales': 'avgsales'}, axis=1)
shops = shops.merge(sales, on='id', how='outer')
# add average product price to shops
price = deals.groupby('shop_id').price.mean().reset_index().rename({'shop_id': 'id', 'price': 'product_price'}, axis=1)
shops = shops.merge(price, on='id', how='outer')
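
# Note on how='outer': shops with no recommended products keep their row with
# NaN avgsales / product_price, and because PART II already restricted deals
# to shop_ids present in shops, the outer merge cannot introduce extra rows,
# so how='left' would give the same result here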

# OUTPUT
deals.to_csv('deals.csv', index=False)
shops.to_csv('shops.csv', index=False)