471# -*- coding: utf-8 -*-
2"""
3此前爬取数据的时候就是写出多个文件便于操作,这里简单的将它们合并
4由于这里进行比较泛的数据分析,不好说应该合并成什么形式,所以还是遇到具体任务再重塑
5"""
6import pandas as pd
7from meituan_spider import ALLCITIES, KEYWORD, PATH
# Special character found in some shop titles; normalised to '·' downstream
S = '•'
# Date the data was collected (embedded in the output file names)
collecting_date = '20220803'
12
def tidy_shops(city='深圳'):
    """Load one city's crawled shops CSV and return a cleaned DataFrame.

    Adds a rough ``region`` column parsed from the address and an
    ``isMain`` boolean marking shops whose back-end category contains
    the crawl keyword (i.e. mainly selling milk tea).
    """
    df = pd.read_csv(f'{PATH}{city}{KEYWORD}_shops.csv')
    # Strip the branch suffix from titles, e.g. "一点点(XX店)" -> "一点点"
    df['title'] = df['title'].map(lambda t: t.split('(')[0])
    # Normalise the special bullet character in titles
    df['title'] = df['title'].str.replace(S, '·')
    # Rough district extraction: text up to (and including) the first '区'.
    # Some cities may yield dirty values; cleaned further downstream.
    df['region'] = df['address'].map(lambda addr: addr.split('区')[0] + '区')
    # Some shops (e.g. pizza places) merely offer tea drinks as a side
    # line; flag whether the keyword appears in the back-end category so
    # non-main-business shops can be filtered later instead of dropped here.
    df['isMain'] = df['backCateName'].map(lambda cat: KEYWORD in cat)
    return df
25
def tidy_deals(city='深圳'):
    """Load one city's crawled deals CSV and return it as a DataFrame.

    Bug fix: the original body was ``pass`` followed by ``return df``,
    which raised NameError because ``df`` was never assigned. The file
    naming mirrors tidy_shops — TODO confirm the '_deals.csv' suffix
    against the spider's output.

    Missing values and duplicates are deliberately left untouched here
    to preserve as much of the raw data as possible.
    """
    df = pd.read_csv(f'{PATH}{city}{KEYWORD}_deals.csv')
    return df
31
32if __name__ == "__main__":
33
34 shops = pd.DataFrame()
35 for c in ALLCITIES:
36 df = tidy_shops(c)
37 df['city'] = df.title.map(lambda x: c)
38 shops = shops.append(df, ignore_index=True)
39 shops.to_csv(f'{KEYWORD}-{collecting_date}-shops.csv', index=None)
40
41 deals = pd.DataFrame()
42 for c in ALLCITIES:
43 df = tidy_deals(c)
44 df['city'] = df.title.map(lambda x: c)
45 deals = deals.append(df, ignore_index=True)
46 deals.to_csv(f'{KEYWORD}-{collecting_date}-deals.csv', index=None)
47 # 得到的结果是全国汇总的两个表,shops 的观测为商铺,deals 的观测为一个推荐商品
661# -*- coding: utf-8 -*-
2"""
3Further Processing
4"""
5
6import pandas as pd
7
# Load the nationwide tables produced by the merger script.
# File names hard-code the crawl keyword and the 2022-08-03 collection date.
deals = pd.read_csv('奶茶-20220803-deals.csv')
shops = pd.read_csv('奶茶-20220803-shops.csv')
10
11# PART O: FUNCTIONS
12
def unify_values(df, col, to_value, *alters):
    """Replace each literal string in *alters* with *to_value* in df[col].

    ``regex=False`` pins literal-substring semantics: the default of
    ``Series.str.replace`` was ``regex=True`` before pandas 2.0 and
    ``False`` from 2.0 on, so the original call behaved differently
    across versions (and would misfire if a variant ever contained a
    regex metacharacter such as '(' or '%').

    Mutates *df* in place and also returns it for chaining.
    """
    for alt in alters:
        df[col] = df[col].str.replace(alt, to_value, regex=False)
    return df
17
# PART I: PROCESSING shops

# Drop exact duplicate rows and renumber from 0.
# reset_index(drop=True) replaces the old reset_index() + drop('index') two-step.
shops = shops.drop_duplicates().reset_index(drop=True)
# Keep only shops mainly selling milk tea, then drop the helper columns.
shops = shops.loc[shops.isMain].drop(['isMain', 'backCateName', 'phone'], axis=1)
# Unify the spellings of the 100 most frequent shop titles.
# NOTE(review): unique_top_list is only an inspection aid used to build
# replace_dict by hand; nothing below reads it.
unique_top_list = list(shops.title.value_counts().head(100).index)
unique_top_list.sort()
# Canonical brand name -> spelling variants observed in the data.
replace_dict = {
    '1 點點': ['1點點', '一點點', '1点点', '1 點點', '1 點點 ', '1 點點奶茶'],
    '700CC': [ '700CC天然苏打水茶饮', '700cc都市茶饮'],
    'CoCo都可': ['coco都可'],
    'HEY JUICE 茶桔便': ['HEY JUICE茶桔便', 'HEYJUICE茶桔便'],
    '皇茶': ['royaltea皇茶'],
    '丸摩堂': ['丸摩堂100%鲜果茶'],
    '厝内小眷村': ['厝内小眷村cuo nei village'],
    '嘿糖': ['嘿糖鲜奶茶饮'],
    '大卡司': ['大卡司DAKASI'],
    '快乐柠檬': ['快乐柠檬happy lemon', '快乐柠檬happylemon', 'happy lemon快乐柠檬'],
    '蜜雪冰城': ['蜜雪冰城·冰淇淋与茶'],
    '贡茶': ['贡茶GONGCHA']
    }
for canonical, variants in replace_dict.items():
    shops = unify_values(shops, 'title', canonical, *variants)
# Drop extreme values: implausible average prices and negative comment counts.
shops = shops.loc[shops.avgprice < 100]
shops = shops.loc[shops.comments >= 0]
# Region strings longer than 5 characters are almost certainly dirty parses.
shops['region'] = ['Unknown' if len(r) > 5 else r for r in shops.region]
46
# PART II: PROCESSING deals

# Keep only deals belonging to a shop that survived PART I filtering,
# selecting just the columns used downstream in one step (the original
# selected six columns and immediately dropped three of them).
# isin() is vectorised; the per-row `x in shops.id.values` scan was O(n*m).
deals = deals.loc[deals.shop_id.isin(shops.id), ['shop_id', 'sales', 'price']]
53
# PART III: MERGE

# Per-shop mean sales and mean product price, computed in a single
# groupby pass via named aggregation (the original ran two separate
# groupby-mean + merge passes; the resulting frame is identical).
stats = (deals.groupby('shop_id')
              .agg(avgsales=('sales', 'mean'), product_price=('price', 'mean'))
              .reset_index()
              .rename({'shop_id': 'id'}, axis=1))
# NOTE(review): how='outer' keeps shops with no deals (NaN stats) AND
# deal-only ids with no shop attributes; if the latter are unwanted,
# how='left' may be the intent — confirm with downstream analysis.
shops = shops.merge(stats, on='id', how='outer')

# OUTPUT
deals.to_csv('deals.csv', index=None)
shops.to_csv('shops.csv', index=None)
66