# -*- coding: utf-8 -*-
# NOTE(review): this chunk is two scripts pasted together (a merge script and
# a "Further Processing" script); both are reconstructed and kept in order.
"""
Merge the per-city CSV files written by the scraper into two nationwide tables.

The scraper wrote one ``*_shops.csv`` and one ``*_deals.csv`` per city.  This
script tidies each city's data and concatenates everything.  Later analysis is
still open-ended, so no further reshaping is done here — reshape per task.
"""
import pandas as pd

from meituan_spider import ALLCITIES, KEYWORD, PATH

# Special bullet character to normalise out of shop titles.
S = '•'
# Date the data was collected (embedded in the output file names).
collecting_date = '20220803'


def tidy_shops(city='深圳'):
    """Load and clean one city's shop table.

    Parameters
    ----------
    city : str
        City name, used to locate ``{PATH}{city}{KEYWORD}_shops.csv``.

    Returns
    -------
    pandas.DataFrame
        Cleaned shop table with extra ``region`` and ``isMain`` columns.
    """
    df = pd.read_csv(f'{PATH}{city}{KEYWORD}_shops.csv')
    # Drop the parenthesised branch suffix from titles, e.g. "一点点(XX店)".
    df['title'] = df.title.map(lambda s: s.split('(')[0])
    # Normalise the special character in titles (literal, not a regex).
    df['title'] = df.title.str.replace(S, '·', regex=False)
    # Rough district extraction: text up to and including the first "区".
    # Some cities may yield dirty values; those are handled downstream.
    df['region'] = df.address.map(lambda s: s.split('区')[0] + '区')
    # Some shops (e.g. pizza places that also sell tea drinks) are not
    # primarily tea shops; flag them instead of dropping, since they do
    # carry relevant products.
    df['isMain'] = df.backCateName.map(lambda cat: KEYWORD in cat)
    return df


def tidy_deals(city='深圳'):
    """Load one city's deals table.

    Missing values and duplicates are deliberately left untouched so that
    as much of the raw data as possible survives to the analysis stage.
    """
    # BUG FIX: the original body was `pass` followed by `return df`, which
    # raised NameError on every call.  Mirroring tidy_shops and the output
    # file naming, the deals CSV is read here — TODO confirm file suffix.
    df = pd.read_csv(f'{PATH}{city}{KEYWORD}_deals.csv')
    return df


if __name__ == "__main__":

    # Concatenate every city's shops into one nationwide table.
    # pd.concat over a list replaces the per-iteration DataFrame.append,
    # which was quadratic and has been removed in pandas 2.0.
    shop_frames = []
    for c in ALLCITIES:
        df = tidy_shops(c)
        df['city'] = c  # scalar broadcast; no need for map(lambda x: c)
        shop_frames.append(df)
    shops = pd.concat(shop_frames, ignore_index=True)
    shops.to_csv(f'{KEYWORD}-{collecting_date}-shops.csv', index=False)

    deal_frames = []
    for c in ALLCITIES:
        df = tidy_deals(c)
        df['city'] = c
        deal_frames.append(df)
    deals = pd.concat(deal_frames, ignore_index=True)
    deals.to_csv(f'{KEYWORD}-{collecting_date}-deals.csv', index=False)
    # Result: two nationwide tables — one row per shop in `shops`,
    # one row per recommended product in `deals`.


# ---------------------------------------------------------------------------
# Second script: Further Processing of the merged tables.
# (Runs unconditionally at top level, as in the original file.)
# ---------------------------------------------------------------------------

deals = pd.read_csv('奶茶-20220803-deals.csv')
shops = pd.read_csv('奶茶-20220803-shops.csv')

# PART O: FUNCTIONS

def unify_values(df, col, to_value, *alters):
    """Replace each alternative spelling in `alters` with `to_value` in
    column `col` (literal substring replacement, in place) and return `df`."""
    for alt in alters:
        df[col] = df[col].str.replace(alt, to_value, regex=False)
    return df

# PART I: PROCESSING shops

# Drop exact duplicate rows and renumber (reset_index(drop=True) replaces
# the reset_index().drop(['index'], axis=1) two-step).
shops = shops.drop_duplicates().reset_index(drop=True)
# Keep only shops whose main business is tea; drop columns no longer needed.
shops = shops.loc[shops.isMain].drop(['isMain', 'backCateName', 'phone'], axis=1)
# Unify alternative spellings among the 100 most frequent titles.
# (unique_top_list is an exploratory aid used to build replace_dict by hand.)
unique_top_list = sorted(shops.title.value_counts().head(100).index)
replace_dict = {
    '1 點點': ['1點點', '一點點', '1点点', '1 點點', '1 點點 ', '1 點點奶茶'],
    '700CC': ['700CC天然苏打水茶饮', '700cc都市茶饮'],
    'CoCo都可': ['coco都可'],
    'HEY JUICE 茶桔便': ['HEY JUICE茶桔便', 'HEYJUICE茶桔便'],
    '皇茶': ['royaltea皇茶'],
    '丸摩堂': ['丸摩堂100%鲜果茶'],
    '厝内小眷村': ['厝内小眷村cuo nei village'],
    '嘿糖': ['嘿糖鲜奶茶饮'],
    '大卡司': ['大卡司DAKASI'],
    '快乐柠檬': ['快乐柠檬happy lemon', '快乐柠檬happylemon', 'happy lemon快乐柠檬'],
    '蜜雪冰城': ['蜜雪冰城·冰淇淋与茶'],
    '贡茶': ['贡茶GONGCHA'],
}
for canonical, variants in replace_dict.items():
    shops = unify_values(shops, 'title', canonical, *variants)
# Drop extreme values.
shops = shops.loc[shops.avgprice < 100]
shops = shops.loc[shops.comments >= 0]
# District names longer than 5 characters are almost certainly mis-parsed.
shops['region'] = ['Unknown' if len(r) > 5 else r for r in shops.region]

# PART II: PROCESSING deals

# Keep only deals whose shop survived the shops filtering.  Series.isin is
# vectorised, replacing the original per-row map(lambda x: x in ...) scan.
# Column selection collapses the original select-then-drop into one step;
# the surviving columns and their order are unchanged.
deals = deals.loc[deals.shop_id.isin(shops.id), ['shop_id', 'sales', 'price']]

# PART III: MERGE

# Attach each shop's mean deal sales to shops.
sales = (deals.groupby('shop_id').sales.mean().reset_index()
              .rename({'shop_id': 'id', 'sales': 'avgsales'}, axis=1))
shops = shops.merge(sales, on='id', how='outer')
# Attach each shop's mean product price to shops.
price = (deals.groupby('shop_id').price.mean().reset_index()
              .rename({'shop_id': 'id', 'price': 'product_price'}, axis=1))
shops = shops.merge(price, on='id', how='outer')

# OUTPUT
deals.to_csv('deals.csv', index=False)
shops.to_csv('shops.csv', index=False)