Preface
Python is very popular right now thanks to its simple syntax and powerful features, and many people want to learn it. We have prepared a set of high-quality Python video tutorials and related e-books for everyone, and you are welcome to pick them up!
1. Scrape the name, profile picture, and age of each Taobao MM
2. Scrape each MM's profile text and photos
3. Save each MM's photos locally, with one folder per MM
4. Become familiar with the process of saving files
The URL we use here is http://mm.taobao.com/json/request_top_list.htm?page=1. The part before the question mark is the base address, and the `page` parameter after it is the page number, which you can change freely. Opening this URL shows short introductions of Taobao MMs, each with a hyperlink to her personal details page.
From this page we need to grab each MM's avatar address, name, age, place of residence, and personal details page address.
After several hands-on exercises, everyone should be familiar with crawling a page and extracting addresses from it, so there is no difficulty here. We first grab each MM's details page address, name, age, and other information from this page and print them out. The code is as follows:
__author__ ='CQC' # -*- coding:utf-8 -*- import urllib import urllib2 import re class Spider: def __init__(self): self.siteURL ='http://mm.taobao.com/json/request_top_list.htm' def getPage(self,pageIndex): url = self.siteURL + "?page=" + str(pageIndex) print url request = urllib2.Request(url) response = urllib2.urlopen(request) return response.read().decode('gbk') def getContents(self,pageIndex): page = self.getPage(pageIndex) pattern = re.compile('<div class="list-item".*?pic-word.*?<a href="(.*?)".*?<img src="(.*?)" .*?<a class="lady-name.*?>(.*?)</a>.*?<strong>(.*?)</strong>.*?<span>(.*?) </span>',re.S) items = re.findall(pattern,page) for item in items: print item[0],item[1],item[2],item[3],item[4] spider = Spider() spider.getContents(1)
The results are as follows
Here we need two ways of writing to files: one for pictures and one for text.
# Given a picture address and a file name, save a single picture
def saveImg(self, imageURL, fileName):
    """Download imageURL and write the received bytes to fileName.

    The download is read completely before the output file is opened, so a
    failed download does not leave an empty file behind. Both the network
    handle and the file are closed even when an error occurs.
    """
    # Local import: this snippet has no import block of its own.
    import contextlib

    # urllib.urlopen() handles are not context managers in Python 2.
    with contextlib.closing(urllib.urlopen(imageURL)) as response:
        data = response.read()
    # Binary mode so the image bytes are written unchanged.
    with open(fileName, 'wb') as f:
        f.write(data)
def saveBrief(self,content,name): fileName = name + "/" + name + ".txt" f = open(fileName,"w+") print u"is secretly saving her personal information as",fileName f.write(content.encode('utf-8'))
# Create a new directory
def mkdir(self, path):
    """Create the directory ``path`` unless something already exists there.

    Leading and trailing whitespace is stripped from ``path`` first.
    Returns True when the directory was created and False when the path
    already existed (in which case nothing is changed).
    """
    target = path.strip()
    if os.path.exists(target):
        # Already present: leave it untouched and report that nothing was made.
        return False
    # makedirs also creates any missing parent directories.
    os.makedirs(target)
    return True
The main points were covered in the previous sections. If you have read the earlier chapters, completing this crawler should not be a problem, so the details are not repeated here and the complete code is posted directly.
spider.py
__author__ ='CQC' # -*- coding:utf-8 -*- import urllib import urllib2 import re import tool import os #Grab MM class Spider: #Page Initialization def __init__(self): self.siteURL ='http://mm.taobao.com/json/request_top_list.htm' self.tool = tool.Tool() #Get the content of the index page def getPage(self,pageIndex): url = self.siteURL + "?page=" + str(pageIndex) request = urllib2.Request(url) response = urllib2.urlopen(request) return response.read().decode('gbk') #Get all MM information in the index interface, list format def getContents(self,pageIndex): page = self.getPage(pageIndex) pattern = re.compile('<div class="list-item".*?pic-word.*?<a href="(.*?)".*?<img src="(.*?)" .*?<a class="lady-name.*?>(.*?)</a>.*?<strong>(.*?)</strong>.*?<span>(.*?) </span>',re.S) items = re.findall(pattern,page) contents = [] for item in items: contents.append([item[0],item[1],item[2],item[3],item[4]]) return contents #Get MM personal details page def getDetailPage(self,infoURL): response = urllib2.urlopen(infoURL) return response.read().decode('gbk') #Get personal text introduction def getBrief(self,page): pattern = re.compile('<div class="mm-aixiu-content".*?>(.*?)<!--',re.S) result = re.search(pattern,page) return self.tool.replace(result.group(1)) #Get all pictures on the page def getAllImg(self,page): pattern = re.compile('<div class="mm-aixiu-content".*?>(.*?)<!--',re.S) #Personal information page all codes content = re.search(pattern,page) #Extract pictures from code patternImg = re.compile('<img.*?src="(.*?)"',re.S) images = re.findall(patternImg,content.group(1)) return images #Save multiple photo pictures def saveImgs(self,images,name): number = 1 print u"discovered",name,u"shared",len(images),u"photos" for imageURL in images: splitPath = imageURL.split('.') fTail = splitPath.pop() if len(fTail)> 3: fTail = "jpg" fileName = name + "/" + str(number) + "." 
+ fTail self.saveImg(imageURL,fileName) number += 1 # Save avatar def saveIcon(self,iconURL,name): splitPath = iconURL.split('.') fTail = splitPath.pop() fileName = name + "/icon." + fTail self.saveImg(iconURL,fileName) #Save profile def saveBrief(self,content,name): fileName = name + "/" + name + ".txt" f = open(fileName,"w+") print u"is secretly saving her personal information as",fileName f.write(content.encode('utf-8')) #Incoming picture address, file name, save a single picture def saveImg(self,imageURL,fileName): u = urllib.urlopen(imageURL) data = u.read() f = open(fileName,'wb') f.write(data) print u"is quietly saving a picture of her as",fileName f.close() #Create new directory def mkdir(self,path): path = path.strip() # Determine whether the path exists # Existing True # Does not exist False isExists=os.path.exists(path) # critical result if not isExists: # Create a directory if it does not exist print u "Secretly created a new folder named ",path,u'" # Create directory operation function os.makedirs(path) return True else: # If the directory exists, do not create it, and prompt that the directory already exists print u "The folder named ",path,' has been created successfully' return False #Save a page of Taobao MM information def savePageInfo(self,pageIndex): #Get the first page of Taobao MM list contents = self.getContents(pageIndex) for item in contents: #item[0] personal details URL, item[1] avatar URL, item[2] name, item[3] age, item[4] residence print u"found a model named ",item[2],u"芳龄",item[3],u",she is",item[4] print u"secretly saving",item[2],"information" print u"accidentally discovered that her personal address is",item[0] #Personal details page URL detailURL = item[0] #Get personal details page code detailPage = self.getDetailPage(detailURL) #Get profile brief = self.getBrief(detailPage) #Get a list of all pictures images = self.getAllImg(detailPage) self.mkdir(item[2]) #Save profile self.saveBrief(brief,item[2]) #Save avatar 
self.saveIcon(item[1],item[2]) #save Picture self.saveImgs(images,item[2]) #Incoming start and end page number, get MM picture def savePagesInfo(self,start,end): for i in range(start,end+1): print u "is secretly looking for the ",i,u" place, see if the MMs are there" self.savePageInfo(i) #Pass in the start and end page number, here 2,10 is passed in, which means to grab the MM of the 2nd to 10th pages spider = Spider() spider.savePagesInfo(2,10)
tool.py
__author__ = 'CQC'
# -*- coding:utf-8 -*-

import re


# Helper class for cleaning up page markup
class Tool:
    """Convert an HTML fragment into plain text.

    The patterns are class attributes and are applied by replace() in a
    fixed order; see replace() for the sequence.
    """

    # Remove <img> tags, runs of 1-7 spaces, and &nbsp; entities.
    # A lone space as the last alternative could never match (it is already
    # covered by " {1,7}"), so the entity is spelled out explicitly.
    removeImg = re.compile('<img.*?>| {1,7}|&nbsp;')
    # Remove hyperlink tags
    removeAddr = re.compile('<a.*?>|</a>')
    # Tags that are turned into a line break (\n)
    replaceLine = re.compile('<tr>|<div>|</div>|</p>')
    # Table cell <td> is turned into a tab (\t)
    replaceTD = re.compile('<td>')
    # A single or double <br> is turned into one \n
    replaceBR = re.compile('<br><br>|<br>')
    # Remove whatever tags remain
    removeExtraTag = re.compile('<.*?>')
    # Collapse runs of line breaks into one
    removeNoneLine = re.compile('\n+')

    def replace(self, x):
        """Return x with markup removed and whitespace normalized.

        Steps, in order: drop images/spaces/&nbsp;, drop link tags, turn
        block-ending tags and <br> into line breaks and <td> into tabs,
        drop all remaining tags, collapse repeated line breaks, and strip
        leading/trailing whitespace.
        """
        steps = (
            (self.removeImg, ""),
            (self.removeAddr, ""),
            (self.replaceLine, "\n"),
            (self.replaceTD, "\t"),
            (self.replaceBR, "\n"),
            (self.removeExtraTag, ""),
            (self.removeNoneLine, "\n"),
        )
        for pattern, replacement in steps:
            x = pattern.sub(replacement, x)
        # strip() removes leftover whitespace at both ends
        return x.strip()
The two files above are all of the code. Run it and give it a try; the result is quite satisfying.
See what's changed in the folder
Before you know it, a large number of MM pictures will have been saved to your computer. Hurry up and try it!
All right! That is the end of this article.
Finally, if you found it helpful, remember to follow, share, and bookmark.
·END·