This first script mainly uses the requests and bs4 (BeautifulSoup) libraries to fetch the hot search page and save the extracted information to d://hotsearch.txt.
import requests
import bs4
from bs4 import BeautifulSoup

mylist = []
r = requests.get(url='https://s.weibo.com/top/summary?Refer=top_hot&topnav=1&wvr=6', timeout=10)
print(r.status_code)  # print the HTTP status code of the response
r.encoding = r.apparent_encoding
demo = r.text

soup = BeautifulSoup(demo, "html.parser")
for link in soup.find('tbody'):
    hotnumber = ''
    if isinstance(link, bs4.element.Tag):
        lis = link('td')
        hotrank = lis[1]('a')[0].string  # hot search ranking
        hotname = lis[1].find('span')    # hot search name
        if isinstance(hotname, bs4.element.Tag):
            hotnumber = hotname.string   # hot search index
        mylist.append([lis[0].string, hotrank, hotnumber, lis[2].string])

f = open("d://hotsearch.txt", "w+")
for line in mylist:
    f.write('%s %s %s %s\n' % (line[0], line[1], line[2], line[3]))
f.close()  # close the file so buffered output is flushed to disk
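Note that s.weibo.com sometimes answers non-browser clients with a redirect or an empty page. As a defensive variation (my own sketch, not part of the original script; the header value is an illustrative assumption), you can send a browser-like User-Agent and fail fast on HTTP errors:

import requests

# Browser-like header; the exact value here is an illustrative assumption.
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"}

r = requests.get(
    'https://s.weibo.com/top/summary?Refer=top_hot&topnav=1&wvr=6',
    headers=headers,
    timeout=10,
)
r.raise_for_status()  # raise on 4xx/5xx instead of silently parsing an error page
print(r.status_code, len(r.text))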
Crawling Weibo Hot Search
import time

import schedule
import pandas as pd
from datetime import datetime
import requests
from bs4 import BeautifulSoup

url = "https://s.weibo.com/top/summary?cate=realtimehot&sudaref=s.weibo.com&display=0&retcode=6102"
get_info_dict = {}
count = 0

def main():
    global url, get_info_dict, count
    get_info_list = []
    print("Crawling data~~~")
    html = requests.get(url).text
    soup = BeautifulSoup(html, 'lxml')
    for tr in soup.find_all(name='tr', class_=''):
        get_info = get_info_dict.copy()
        get_info['title'] = tr.find(class_='td-02').find(name='a').text
        try:
            get_info['num'] = eval(tr.find(class_='td-02').find(name='span').text)
        except AttributeError:
            get_info['num'] = None  # the pinned top item has no index span
        get_info['time'] = datetime.now().strftime("%Y/%m/%d %H:%M")
        get_info_list.append(get_info)
    get_info_list = get_info_list[1:16]  # drop the pinned item, keep the top 15
    df = pd.DataFrame(get_info_list)
    if count == 0:
        # first run: write the CSV header
        df.to_csv('datas.csv', mode='a+', index=False, encoding='gbk')
        count += 1
    else:
        # later runs: append rows without repeating the header
        df.to_csv('datas.csv', mode='a+', index=False, header=False, encoding='gbk')

# run the crawler on a schedule
schedule.every(1).minutes.do(main)

while True:
    schedule.run_pending()
    time.sleep(1)  # avoid busy-waiting between scheduled runs
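Each run appends 15 rows that share one 'time' stamp, which is exactly the block size the pyecharts section below slices on. A quick sanity check (my own sketch, not from the original code) confirms the accumulated layout:

import pandas as pd

# Each scheduled crawl should contribute one 15-row block with the same stamp.
df = pd.read_csv('datas.csv', encoding='gbk')
print(df.groupby('time').size())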
pyecharts data analysis
import pandas as pd
from pyecharts import options as opts
from pyecharts.charts import Bar, Timeline, Grid
from pyecharts.globals import ThemeType, CurrentConfig

df = pd.read_csv('datas.csv', encoding='gbk')
print(df)

t = Timeline(init_opts=opts.InitOpts(theme=ThemeType.MACARONS))  # custom theme
for i in range(int(df.shape[0] / 15)):
    bar = (
        Bar()
        .add_xaxis(list(df['title'][i*15: i*15+15][::-1]))       # x-axis data
        .add_yaxis('num', list(df['num'][i*15: i*15+15][::-1]))  # y-axis data
        .reversal_axis()  # flip to horizontal bars
        .set_global_opts(  # global configuration items
            title_opts=opts.TitleOpts(  # title configuration item
                title=f"{list(df['time'])[i * 15]}",
                pos_right="5%",
                pos_bottom="15%",
                title_textstyle_opts=opts.TextStyleOpts(font_family='KaiTi', font_size=24, color='#FF1493'),
            ),
            xaxis_opts=opts.AxisOpts(  # x-axis configuration item
                splitline_opts=opts.SplitLineOpts(is_show=True),
            ),
            yaxis_opts=opts.AxisOpts(  # y-axis configuration item
                splitline_opts=opts.SplitLineOpts(is_show=True),
                axislabel_opts=opts.LabelOpts(color='#DC143C'),
            ),
        )
        .set_series_opts(  # series configuration items
            label_opts=opts.LabelOpts(  # label configuration
                position="right", color='#9400D3',
            )
        )
    )
    grid = Grid().add(bar, grid_opts=opts.GridOpts(pos_left="24%"))
    t.add(grid, "")

t.add_schema(
    play_interval=1000,      # carousel speed
    is_timeline_show=False,  # whether to show the timeline component
    is_auto_play=True,       # whether to auto-play
)
t.render('Time Carousel Diagram.html')
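One pitfall: rows where the index span was missing were stored as empty values, and a non-numeric 'num' column makes the bar lengths unreliable. A small cleanup step (my own sketch, not in the original) coerces the column before building the charts:

import pandas as pd

# Coerce 'num' to numbers; anything unparseable (e.g. a missing index)
# becomes NaN, which we replace with 0 so every bar gets a length.
df = pd.read_csv('datas.csv', encoding='gbk')
df['num'] = pd.to_numeric(df['num'], errors='coerce').fillna(0).astype(int)

The Grid wrapper with pos_left="24%" is presumably there to reserve room on the left of the plot for long hot search titles on the y-axis.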
That concludes this article on how to use Python to crawl and save Weibo hot search data!