import re
import json
import urllib.parse
import urllib.request
import http.cookiejar

import requests as rq
import pandas as pd
from bs4 import BeautifulSoup
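# Default list of target cities (major cities plus provincial/regional capitals).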
ALLCITIES = ['北京', '上海', '广州', '深圳', '天津', '重庆',
'石家庄', '太原', '呼和浩特', '沈阳', '长春', '哈尔滨', '南京', '杭州', '合肥',
'福州', '南昌', '济南', '郑州', '武汉', '长沙', '南宁', '海口', '成都', '贵阳',
'昆明', '拉萨', '西安', '兰州', '西宁', '银川', '乌鲁木齐']
class MtSpider:
    '''
    Parameters: cityname[String] | keyword[String]
    Feature: get n pages of search results on Meituan for a given keyword and city.
    '''
    def __init__(self, cityname, keyword):
        self.name = cityname
        self.keyword = urllib.parse.quote(keyword)
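        # Mobile (Android Chrome) and desktop (Windows Chrome) User-Agent strings; the desktop one is used for the page requests below.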
self.linux_headers = {'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Mobile Safari/537.36'}
self.windows_headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36'}
self.citylink = self.get_city_link()
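        # The city subdomain (e.g. bj.meituan.com) is the third '/'-separated piece of the city link.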
self.host = self.citylink.split('/')[2]
self.cityid = self.get_city_id()
self.cookies = self.get_cookies()
    def get_city_link(self):
        '''Resolve the city's search URL; called during __init__.'''
res = rq.get('https://www.meituan.com/changecity/', headers=self.windows_headers)
soup = BeautifulSoup(res.text, features='lxml')
cities = soup.find_all('a', {'class': 'link city'})
        for c in cities:
            if self.name in c.text or c.text in self.name:
                link = 'https:' + c.attrs['href'] + '/s/' + self.keyword
                return link
    def get_city_id(self):
        '''Extract the numeric city id; called during __init__.'''
headers = dict(self.windows_headers, Host=self.host)
res = rq.get(self.citylink, headers=headers)
        # the numeric city id is embedded in inline JSON on the city search page
        cityid = re.findall(r'{"id":(\d+),"name"', res.text)[0]
        return cityid
    def get_cookies(self):
        '''Collect the site cookies as a header string; called during __init__.'''
jar = http.cookiejar.CookieJar()
processor = urllib.request.HTTPCookieProcessor(jar)
opener = urllib.request.build_opener(processor)
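        # Opening the city page once lets the server set its cookies into the jar.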
_ = opener.open(self.citylink)
        cookies = []
        for i in jar:
            cookies.append(i.name + '=' + i.value)
return '; '.join(cookies)
def get_json(self, page):
'''Get data of one page'''
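        # Meituan mobile search API: 32 results per page, offset advances in multiples of 32.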
url = 'https://apimobile.meituan.com/group/v4/poi/pcsearch/{}'
url += '?userid=-1&limit=32&offset={}&cateId=-1&q={}'
url = url.format(self.cityid, page*32, self.keyword)
        headers = {
            'Cookie': self.cookies,
            'Host': 'apimobile.meituan.com',
            'Origin': 'https://' + self.host,
            'Referer': self.citylink,
            'User-Agent': self.windows_headers['User-Agent']
        }
res = rq.get(url, headers=headers)
data = json.loads(res.text)
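        # 'searchResult' is a list of shop records; each record may carry a 'deals' list (consumed by parse_data).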
return data['data']['searchResult']
def parse_data(self, data):
'''Parse data of one page'''
        def drop_keys(item, *drops):
            copy = dict(item)
            for name in drops:
                copy.pop(name, None)   # tolerate records that lack a key
            return copy
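        # Shop-level columns to keep; every other key returned by the API is dropped.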
fields = ['id', 'title', 'backCateName', 'address', 'areaname',
'latitude', 'longitude', 'avgprice', 'avgscore', 'comments', 'phone']
drops = list(set(data[0].keys()) - set(fields))
shops = pd.DataFrame([drop_keys(d, *drops) for d in data])
names = ['shop_id', 'title', 'price', 'value', 'sales']
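        # Deal-level columns to keep; deals are flattened into one row per deal, tagged with the owning shop's id.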
        dicts = []
        for i in range(len(data)):
            if data[i]['deals'] and isinstance(data[i]['deals'][0], dict):
                for d in data[i]['deals']:
                    d['shop_id'] = shops.loc[i, 'id']
                    dicts.append(d)
        deals = pd.DataFrame(dicts)
        deals.drop(set(deals.columns) - set(names), axis=1, inplace=True)
        return deals, shops
    def main(self, pages):
        '''Crawl the given number of pages; return (deals, shops) DataFrames.'''
        deals, shops = pd.DataFrame(), pd.DataFrame()
        for p in range(pages):
            try:
                df1, df2 = self.parse_data(self.get_json(p))
                deals = pd.concat([deals, df1])   # DataFrame.append was removed in pandas 2.0
                shops = pd.concat([shops, df2])
                print(f'>>> Page No.{p+1} finished...')
            except Exception as e:
                print('ERROR: ' + str(e))
                # refresh the cookies and move on to the next page
                self.cookies = self.get_cookies()
        return deals, shops
if __name__ == "__main__":
    # Example run configuration: adjust the keyword, page count, city list and output path as needed.
    keyword = '火锅'
    pages = 2
    path = './'
    cities = ALLCITIES
    results = dict.fromkeys(cities)
    for c in cities:
        print(f' Fetching data for {c} '.center(72, '='))
        results[c] = MtSpider(c, keyword).main(pages)
for city, returns in results.items():
        returns[0].to_csv(f'{path}{city}{keyword}_deals.csv', index=False)
        returns[1].to_csv(f'{path}{city}{keyword}_shops.csv', index=False)