# -*- coding: utf-8 -*-
"""
Automatically fetch Meituan search results for different cities and different search keywords.
"""
import re
import time
import json
import urllib.parse
import urllib.request   # `import urllib` alone does not expose the parse/request submodules used below
import http.cookiejar
import pandas as pd
import requests as rq
from bs4 import BeautifulSoup

ALLCITIES = ['北京', '上海', '广州', '深圳', '天津', '重庆',
    '石家庄', '太原', '呼和浩特', '沈阳', '长春', '哈尔滨', '南京', '杭州', '合肥',
    '福州', '南昌', '济南', '郑州', '武汉', '长沙', '南宁', '海口', '成都', '贵阳',
    '昆明', '拉萨', '西安', '兰州', '西宁', '银川', '乌鲁木齐']
KEYWORD = '奶茶'
PATH = './data/'
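# NOTE: pandas' to_csv will not create PATH; make sure the directory exists
# before running, e.g. os.makedirs(PATH, exist_ok=True) after an `import os`.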
class MtSpider:
    '''
    Parameters: cityname[String] | keyword[String]
    Feature: Get n pages of search results on Meituan for the given keyword and city.
    '''
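
    # Minimal usage sketch (illustrative city/keyword; requires network access):
    #   spider = MtSpider('上海', '奶茶')
    #   deals, shops = spider.main(pages=3)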
    def __init__(self, cityname, keyword):
        '''Initialization'''

        self.name = cityname
        self.keyword = urllib.parse.quote(keyword)  # URL-encode once; reused in links and API calls
        self.linux_headers = {'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Mobile Safari/537.36'}  # mobile UA, currently unused
        self.windows_headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36'}
        self.citylink = self.get_city_link()    # Searching URL
        self.host = self.citylink.split('/')[2] # City hostname
        self.cityid = self.get_city_id()
        self.cookies = self.get_cookies()


    def get_city_link(self):
        '''Called during initialization'''

        res = rq.get('https://www.meituan.com/changecity/', headers=self.windows_headers)
        soup = BeautifulSoup(res.text, features='lxml')
        cities = soup.find_all('a', {'class': 'link city'})
        link = None
        for c in cities:
            # Two-way substring match tolerates name variants such as '北京' vs. '北京市'
            if self.name in c.text or c.text in self.name:
                link = 'https:' + c.attrs['href'] + '/s/' + self.keyword
                break
        if link is None:
            raise ValueError(f'City "{self.name}" not found on the changecity page')

        return link


    def get_city_id(self):
        '''Called during initialization'''

        headers = dict(self.windows_headers, Host=self.host)
        res = rq.get(self.citylink, headers=headers)
        # The numeric city id sits in the page's inline JSON state,
        # in a fragment like {"id":1,"name":...}; take the first match
        cityid = re.findall(r'{"id":(\d+),"name"', res.text)[0]

        return cityid


    def get_cookies(self):
        '''Called during initialization'''

        # Open the city search page once so the server sets its session
        # cookies, then serialize them into a single Cookie header value
        jar = http.cookiejar.CookieJar()
        processor = urllib.request.HTTPCookieProcessor(jar)
        opener = urllib.request.build_opener(processor)

        _ = opener.open(self.citylink)
        cookies = [i.name + '=' + i.value for i in jar]

        return '; '.join(cookies)


    def get_json(self, page):
        '''Get data of one page'''

        # Mobile search API: each page holds up to 32 results,
        # so the offset advances in steps of 32
        url = 'https://apimobile.meituan.com/group/v4/poi/pcsearch/{}'
        url += '?userid=-1&limit=32&offset={}&cateId=-1&q={}'
        url = url.format(self.cityid, page*32, self.keyword)    # API URL
        headers = {
            'Cookie': self.cookies,
            'Host': 'apimobile.meituan.com',
            'Origin': 'https://' + self.host,
            'Referer': self.citylink,
            'User-Agent': self.windows_headers['User-Agent']
        }
        res = rq.get(url, headers=headers)
        data = json.loads(res.text)

        return data['data']['searchResult']
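
    # Inferred shape of one searchResult entry, judging from the fields
    # consumed in parse_data below (not an official schema):
    #   {'id': ..., 'title': ..., 'address': ..., 'avgprice': ...,
    #    'deals': [{'title': ..., 'price': ..., 'value': ..., 'sales': ...}, ...]}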


    def parse_data(self, data):
        '''Parse data of one page'''

        # Shop information
        def drop_keys(item, *drops):
            copy = item.copy()
            for name in drops:          # plain loop: pop() is called for its side effect
                copy.pop(name, None)    # default None tolerates records missing a key
            return copy
        fields = ['id', 'title', 'backCateName', 'address', 'areaname',
            'latitude', 'longitude', 'avgprice', 'avgscore', 'comments', 'phone']
        drops = list(set(data[0].keys()) - set(fields))
        shops = pd.DataFrame([drop_keys(d, *drops) for d in data])

        # Deal information
        names = ['shop_id', 'title', 'price', 'value', 'sales']
        dicts = []
        # Add a foreign key referencing shops & flatten the nested deal lists:
        for i in range(len(data)):
            if data[i]['deals'] and isinstance(data[i]['deals'][0], dict):
                for d in data[i]['deals']:
                    d['shop_id'] = shops.loc[i, 'id']
                    dicts.append(d)
        deals = pd.DataFrame(dicts)
        deals.drop(list(set(deals.columns) - set(names)), axis=1, inplace=True)

        return deals, shops


    def main(self, pages):
        '''Entry'''

        deals = pd.DataFrame()
        shops = pd.DataFrame()
        for p in range(pages):
            try:
                df1, df2 = self.parse_data(self.get_json(p))
                # DataFrame.append was removed in pandas 2.0; concatenate instead
                deals = pd.concat([deals, df1], ignore_index=True)
                shops = pd.concat([shops, df2], ignore_index=True)
                print(f'>>> Page No.{p+1} finished...')
            except Exception as e:
                print('ERROR: ' + str(e))
                self.cookies = self.get_cookies()   # Update cookies, then move on to the next page
                continue
            time.sleep(1)   # pause between pages to avoid hammering the API

        return deals, shops


if __name__ == "__main__":

    # Example
    keyword = KEYWORD
    cities = ALLCITIES
    path = PATH
    pages = 30

    results = dict.fromkeys(cities)
    for c in cities:
        print(f'  Fetching data for {c}  '.center(72, '='))
        results[c] = MtSpider(c, keyword).main(pages)
    for city, returns in results.items():
        returns[0].to_csv(f'{path}{city}{keyword}_deals.csv', index=False)
        returns[1].to_csv(f'{path}{city}{keyword}_shops.csv', index=False)
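
    # Rough runtime estimate: 32 cities x 30 pages with a 1 s pause per page
    # is about 960 s (~16 min) of sleep alone, before any network latency.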