This project crawls all the rental listings on Lianjia.com for Beijing, Shanghai, Guangzhou and Shenzhen, and uses the data to draw conclusions about rent distribution, the factors renters should weigh, and so on.
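To give a sense of the analysis side: once the crawler below has filled the Lianjia.zufang collection, a per-city rent summary can be pulled straight out of MongoDB. The sketch below is my addition, not part of the original project; it assumes rent_price_listing holds a plain number as a string and simply skips any value that does not parse.

import os
from collections import defaultdict

from pymongo import MongoClient

# Same connection defaults as the crawler below.
host = os.environ.get('MONGODB_HOST', '127.0.0.1')
port = os.environ.get('MONGODB_PORT', '27017')
coll = MongoClient('mongodb://{}:{}'.format(host, port))['Lianjia']['zufang']

prices = defaultdict(list)
for doc in coll.find({}, {'city': 1, 'rent_price_listing': 1}):
    try:
        prices[doc['city']].append(float(doc['rent_price_listing']))
    except (KeyError, TypeError, ValueError):
        continue  # skip missing or non-numeric prices (e.g. range strings)

for city, vals in sorted(prices.items()):
    print('{}: {} listings, average listed rent {:.0f}'.format(city, len(vals), sum(vals) / len(vals)))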
Operating environment: Python 3 with the requests and pymongo packages, plus a running MongoDB instance (the script reads the host and port from environment variables, defaulting to 127.0.0.1:27017).
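Before running the crawler it is worth confirming that MongoDB is reachable under the same defaults the script uses. This check is my addition, mirroring the MONGODB_HOST/MONGODB_PORT variables read in the source below:

import os
from pymongo import MongoClient

host = os.environ.get('MONGODB_HOST', '127.0.0.1')
port = os.environ.get('MONGODB_PORT', '27017')
client = MongoClient('mongodb://{}:{}'.format(host, port), serverSelectionTimeoutMS=2000)
client.admin.command('ping')  # raises ServerSelectionTimeoutError if MongoDB is not running
print('MongoDB is reachable at {}:{}'.format(host, port))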
Source code
import os
import re
import time

import requests
from pymongo import MongoClient

from info import rent_type, city_info


class Rent(object):
    """
    Crawler for Lianjia rental listings.
    Initialization obtains the rental types (whole rent, shared rent) and the
    city/district information to crawl, then connects to the MongoDB database.
    """

    def __init__(self):
        self.rent_type = rent_type
        self.city_info = city_info

        host = os.environ.get('MONGODB_HOST', '127.0.0.1')  # local database
        port = os.environ.get('MONGODB_PORT', '27017')  # database port
        mongo_url = 'mongodb://{}:{}'.format(host, port)
        mongo_db = os.environ.get('MONGODB_DATABASE', 'Lianjia')
        client = MongoClient(mongo_url)
        self.db = client[mongo_db]
        # Use the mobile-site link as the unique key for deduplication
        self.db['zufang'].create_index('m_url', unique=True)

    def get_data(self):
        """
        Crawl rental listings for each rental type, city and district.
        :return: None
        """
        for ty, type_code in self.rent_type.items():  # whole rent, shared rent
            for city, info in self.city_info.items():  # city and district information
                for dist, dist_py in info[2].items():  # each district and its pinyin
                    res_bc = requests.get('https://m.lianjia.com/chuzu/{}/zufang/{}/'.format(info[1], dist_py))
                    pa_bc = r'data-type="bizcircle" data-key="(.*)" class="oneline ">'
                    bc_list = re.findall(pa_bc, res_bc.text)
                    self._write_bc(bc_list)
                    # Crawl the bizcircles of each district first, then crawl the
                    # listings per bizcircle: crawling by district alone returns at
                    # most 2000 listings per district.
                    bc_list = self._read_bc()

                    if len(bc_list) > 0:
                        for bc_name in bc_list:
                            idx = 0
                            has_more = 1
                            while has_more:
                                try:
                                    url = 'https://app.api.lianjia.com/Rentplat/v1/house/list?city_id={}&condition={}' \
                                          '/rt{}&limit=30&offset={}&request_ts={}&scene=list'.format(
                                              info[0], bc_name, type_code, idx * 30, int(time.time()))
                                    res = requests.get(url=url, timeout=10)
                                    print('Successfully crawled {} {}-{} ({}) page {}!'.format(
                                        city, dist, bc_name, ty, idx + 1))
                                    item = {'city': city, 'type': ty, 'dist': dist}
                                    self._parse_record(res.json()['data']['list'], item)

                                    total = res.json()['data']['total']
                                    idx += 1
                                    if total / 30 <= idx:
                                        has_more = 0
                                    # time.sleep(random.random())  # optional throttling (requires import random)
                                except Exception:
                                    print('Request failed, retrying!')

    def _parse_record(self, data, item):
        """
        Parse the JSON data of the crawled response.
        :param data: a list containing listing data
        :param item: dictionary passed through
        :return: None
        """
        if len(data) > 0:
            for rec in data:
                item['bedroom_num'] = rec.get('frame_bedroom_num')
                item['hall_num'] = rec.get('frame_hall_num')
                item['bathroom_num'] = rec.get('frame_bathroom_num')
                item['rent_area'] = rec.get('rent_area')
                item['house_title'] = rec.get('house_title')
                item['resblock_name'] = rec.get('resblock_name')
                item['bizcircle_name'] = rec.get('bizcircle_name')
                item['layout'] = rec.get('layout')
                item['rent_price_listing'] = rec.get('rent_price_listing')
                item['house_tag'] = self._parse_house_tags(rec.get('house_tags'))
                item['frame_orientation'] = rec.get('frame_orientation')
                item['m_url'] = rec.get('m_url')
                item['rent_price_unit'] = rec.get('rent_price_unit')

                # Fetch the mobile detail page for coordinates and metro distance
                try:
                    res2 = requests.get(item['m_url'], timeout=5)
                    pa_lon = r"longitude:'(.*)',"
                    pa_lat = r"latitude:'(.*)'"
                    pa_distance = r'<span class="fr">(\d*)米</span>'
                    item['longitude'] = re.findall(pa_lon, res2.text)[0]
                    item['latitude'] = re.findall(pa_lat, res2.text)[0]
                    distance = re.findall(pa_distance, res2.text)
                    if len(distance) > 0:
                        item['distance'] = distance[0]
                    else:
                        item['distance'] = None
                except Exception:
                    item['longitude'] = None
                    item['latitude'] = None
                    item['distance'] = None

                self.db['zufang'].update_one({'m_url': item['m_url']}, {'$set': item}, upsert=True)
                print('Successfully saved data: {}!'.format(item))

    @staticmethod
    def _parse_house_tags(house_tag):
        """
        Process the house_tags field; this is essentially data cleaning.
        :param house_tag: data in the house_tags field
        :return: processed house_tags
        """
        if house_tag:  # may be None or empty
            st = ''
            for tag in house_tag:
                st += tag.get('name') + ' '
            return st.strip()

    @staticmethod
    def _write_bc(bc_list):
        """
        Write the crawled bizcircles to a txt file so the whole crawl
        is easier to control.
        :param bc_list: bizcircle list
        :return: None
        """
        with open('bc_list.txt', 'w') as f:
            for bc in bc_list:
                f.write(bc + '\n')

    @staticmethod
    def _read_bc():
        """
        Read the bizcircles back in.
        :return: bizcircle list
        """
        with open('bc_list.txt', 'r') as f:
            return [bc.strip() for bc in f.readlines()]


if __name__ == '__main__':
    rent = Rent()
    rent.get_data()
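One caveat: the script imports rent_type and city_info from an info module that the article never shows. From the way get_data() uses them (info[0] as the API's city_id, info[1] as the city's pinyin slug in the m.lianjia.com URL, info[2] as a district-to-pinyin map, and each rent_type value as the rt condition code), a minimal sketch of that module might look like the following; every concrete code and district below is an illustrative placeholder, not a value confirmed by the article.

# info.py -- hypothetical sketch, inferred from how get_data() indexes these structures.

# rental type -> condition code used in the 'rt{}' part of the API URL (placeholder codes)
rent_type = {
    'whole rent': 200111000000,
    'shared rent': 200112000000,
}

# city -> (city_id for the app API, pinyin slug for m.lianjia.com, {district: pinyin})
# The city_id values below are the GB administrative division codes; whether
# Lianjia's API uses exactly these is an assumption. District maps are truncated examples.
city_info = {
    'Beijing': (110000, 'bj', {'Chaoyang': 'chaoyang', 'Haidian': 'haidian'}),
    'Shanghai': (310000, 'sh', {'Pudong': 'pudong', 'Xuhui': 'xuhui'}),
    'Guangzhou': (440100, 'gz', {'Tianhe': 'tianhe', 'Yuexiu': 'yuexiu'}),
    'Shenzhen': (440300, 'sz', {'Nanshan': 'nanshan', 'Futian': 'futian'}),
}

With an info.py like this next to the script, running it starts the crawl and upserts each listing into the zufang collection.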
Alright! That is everything this article has to share with you.
Finally, if you found it helpful, remember to follow, share, and favorite.
·END·