Python crawler: crawl and analyze rental housing data for Beijing, Shanghai, Guangzhou and Shenzhen

This project crawls all rental listings on Lianjia.com for Beijing, Shanghai, Guangzhou and Shenzhen, then analyzes the data to draw conclusions about rent distribution, factors to consider when renting, and so on.

Operating environment:

  • Python 3.6

Packages that need to be installed (see the install command after the list):

  • requests
  • pyecharts
  • pandas
  • numpy
  • pymongo
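
All of these packages are available from PyPI, so a single pip command should be enough (assuming pip belongs to the Python 3.6 interpreter above):

pip install requests pyecharts pandas numpy pymongo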


Source code
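
The script imports rent_type and city_info from a local info module that the article does not show. Based on how the code uses them, a minimal sketch of that module might look like the following; the condition codes, city IDs and districts are illustrative placeholders, not real Lianjia values:

# info.py -- minimal sketch; every value below is a placeholder, not a real Lianjia code
rent_type = {
    'whole_rent': 200600000001,   # hypothetical condition code for renting a whole flat
    'shared_rent': 200600000002,  # hypothetical condition code for a shared room
}

# city name -> (API city_id, city pinyin used in m.lianjia.com URLs, {district name: district pinyin})
city_info = {
    'Beijing': (110000, 'bj', {'Chaoyang': 'chaoyang', 'Haidian': 'haidian'}),
    'Shanghai': (310000, 'sh', {'Pudong': 'pudong', 'Xuhui': 'xuhui'}),
    # ... Guangzhou and Shenzhen follow the same structure
}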

import os
import re
import time
import requests
from pymongo import MongoClient
from info import rent_type, city_info


class Rent(object):
    """
    Initialize with the rental types (whole rent, shared rent) and the city/district information to crawl, and connect to the MongoDB database
    """
    def __init__(self):
        self.rent_type = rent_type
        self.city_info = city_info

        host = os.environ.get('MONGODB_HOST', '127.0.0.1')  # local database host
        port = os.environ.get('MONGODB_PORT', '27017')  # database port
        mongo_url = 'mongodb://{}:{}'.format(host, port)
        mongo_db = os.environ.get('MONGODB_DATABASE', 'Lianjia')
        client = MongoClient(mongo_url)
        self.db = client[mongo_db]
        self.db['zufang'].create_index('m_url', unique=True)  # use the mobile-site (m.lianjia.com) URL as a unique key for deduplication

    def get_data(self):
        """
        Crawl rental listings for each rental type, city and district
        :return: None
        """
        for ty, type_code in self.rent_type.items():  # whole rent, shared rent
            for city, info in self.city_info.items():  # city name and its (city_id, pinyin, districts) info
                for dist, dist_py in info[2].items():  # each district and its pinyin
                    res_bc = requests.get('https://m.lianjia.com/chuzu/{}/zufang/{}/'.format(info[1], dist_py))
                    pa_bc = r'data-type="bizcircle" data-key="(.*)" class="oneline ">'
                    bc_list = re.findall(pa_bc, res_bc.text)
                    self._write_bc(bc_list)
                    bc_list = self._read_bc()  # crawl the business circles of each district first, then crawl listings per business circle; crawling by district alone returns at most 2,000 listings per district

                    if len(bc_list) > 0:
                        for bc_name in bc_list:
                            idx = 0
                            has_more = 1
                            while has_more:
                                try:
                                    url = 'https://app.api.lianjia.com/Rentplat/v1/house/list?city_id={}&condition={}' \
                                          '/rt{}&limit=30&offset={}&request_ts={}&scene=list'.format(
                                              info[0], bc_name, type_code, idx * 30, int(time.time()))
                                    res = requests.get(url=url, timeout=10)
                                    print('Successfully crawled {} {} - {} ({}) page {}!'.format(city, dist, bc_name, ty, idx + 1))
                                    item = {'city': city, 'type': ty, 'dist': dist}
                                    self._parse_record(res.json()['data']['list'], item)

                                    total = res.json()['data']['total']
                                    idx += 1
                                    if total/30 <= idx:
                                        has_more = 0
                                    # time.sleep(random.random())
                                except:
                                    print('The link access is unsuccessful, retrying!')

    def _parse_record(self, data, item):
        """
        Parse the JSON data in the crawled response
        :param data: list of house records
        :param item: dict of fields shared by this batch (city, rental type, district)
        :return: None
        """
        if len(data) > 0:
            for rec in data:
                item['bedroom_num'] = rec.get('frame_bedroom_num')
                item['hall_num'] = rec.get('frame_hall_num')
                item['bathroom_num'] = rec.get('frame_bathroom_num')
                item['rent_area'] = rec.get('rent_area')
                item['house_title'] = rec.get('house_title')
                item['resblock_name'] = rec.get('resblock_name')
                item['bizcircle_name'] = rec.get('bizcircle_name')
                item['layout'] = rec.get('layout')
                item['rent_price_listing'] = rec.get('rent_price_listing')
                item['house_tag'] = self._parse_house_tags(rec.get('house_tags'))
                item['frame_orientation'] = rec.get('frame_orientation')
                item['m_url'] = rec.get('m_url')
                item['rent_price_unit'] = rec.get('rent_price_unit')

                try:
                    res2 = requests.get(item['m_url'], timeout=5)
                    pa_lon = r"longitude:'(.*)',"
                    pa_lat = r"latitude:'(.*)'"
                    pa_distance = r"<span class=\"fr\">(\d*)米</span>"
                    item['longitude'] = re.findall(pa_lon, res2.text)[0]
                    item['latitude'] = re.findall(pa_lat, res2.text)[0]
                    distance = re.findall(pa_distance, res2.text)
                    if len(distance) > 0:
                        item['distance'] = distance[0]
                    else:
                        item['distance'] = None
                except:
                    item['longitude'] = None
                    item['latitude'] = None
                    item['distance'] = None

                self.db['zufang'].update_one({'m_url': item['m_url']}, {'$set': item}, upsert=True)
                print('Successfully saved record: {}!'.format(item))

    @staticmethod
    def _parse_house_tags(house_tag):
        """
        Clean up the house_tags field (a simple data-cleaning step)
        :param house_tag: data in the house_tags field
        :return: processed house_tags
        """
        if len(house_tag) > 0:
            st = ''
            for tag in house_tag:
                st += tag.get('name') + ' '  # join tag names with spaces
            return st.strip()

    @staticmethod
    def _write_bc(bc_list):
        """
        Write the crawled business circles to a txt file so that the crawling process is easier to control and resume
        :param bc_list: list of business circle names
        :return: None
        """
        with open('bc_list.txt', 'w') as f:
            for bc in bc_list:
                f.write(bc+'\n')

    @staticmethod
    def _read_bc():
        """
        Read the business circles back from the txt file
        :return: list of business circle names
        """
        with open('bc_list.txt', 'r') as f:
            return [bc.strip() for bc in f.readlines()]


if __name__ == '__main__':
    rent = Rent()
    rent.get_data()
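
The article mentions analyzing the crawled data with pandas and pyecharts (rent distribution, factors to consider when renting) but does not include that code. As a minimal sketch, the MongoDB collection filled by the crawler can be pulled into pandas like this; the aggregation shown is just an illustration, not the original analysis:

import pandas as pd
from pymongo import MongoClient

# Load everything the crawler stored in the Lianjia.zufang collection into a DataFrame
client = MongoClient('mongodb://127.0.0.1:27017')
df = pd.DataFrame(list(client['Lianjia']['zufang'].find({}, {'_id': 0})))

# Prices and areas come back as strings; coerce them to numbers before aggregating
df['rent_price_listing'] = pd.to_numeric(df['rent_price_listing'], errors='coerce')
df['rent_area'] = pd.to_numeric(df['rent_area'], errors='coerce')

# Example: median listing price per city and rental type as a first look at the rent distribution
print(df.groupby(['city', 'type'])['rent_price_listing'].median())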

That's all for this article.

Finally, if you found it helpful, remember to follow, share and bookmark.

·END·

Reference: Python crawler: crawl and analyze rental housing data for Beijing, Shanghai, Guangzhou and Shenzhen, Tencent Cloud Community, https://cloud.tencent.com/developer/article/1481359