PyScraper: The Python Screen Scraper
I wrote a lightweight Python-based screen scraper, which seems to be working great.
Some of the features:
- Session Management when using cookies
- Separate functions for GET, POST, and downloading large files
- Automatic handling of redirections
Here is the code for it.
import httplib,urllib,random,sys,re,os
from urlparse import urlparse
class PyScraper:
    """Minimal HTTP client that keeps one cookie string between requests.

    Attributes:
        cookie: Raw value of the most recent ``Set-Cookie`` response header.
            It is sent back verbatim as the ``Cookie`` request header; no
            attribute parsing (path, expiry, ...) is performed.
        currenturl: Initialised to ``""``; no method of this class updates it.
        urlhist: Every URL requested so far, in order, including the targets
            of followed redirects.
    """

    # Response statuses that get/post/download follow via the Location header.
    _REDIRECT_CODES = (301, 302)
    # Width of the download progress bar; matches the %-20s column below.
    _BAR_WIDTH = 20

    def __init__(self):
        self.cookie = ""
        self.currenturl = ""
        self.urlhist = []

    def __str__(self):
        """Return the URL history as ``url1->url2->`` (arrow after each URL)."""
        return "".join(item + "->" for item in self.urlhist)

    def _open(self, method, url, body=None, content_type=None):
        """Record *url*, send one request and return ``(connection, response)``.

        The caller owns the returned connection and must close it.
        """
        self.urlhist.append(url)
        scheme, host, path, params, query, fragment = urlparse(url)
        # An empty path ("http://host") would produce an invalid request line.
        target = path or "/"
        if query != "":
            target = target + "?" + query
        headers = {}
        if self.cookie:
            headers["Cookie"] = self.cookie
        if content_type is not None:
            headers["Content-Type"] = content_type
        if scheme == "https":
            conn = httplib.HTTPSConnection(host)
        else:
            conn = httplib.HTTPConnection(host)
        try:
            conn.request(method, target, body, headers)
            resp = conn.getresponse()
        except Exception:
            # Do not leak the socket when the request itself fails.
            conn.close()
            raise
        return conn, resp

    def _store_cookie(self, resp):
        """Remember the response's Set-Cookie header, if it sent one."""
        cookie = resp.getheader("set-cookie")
        if cookie is not None:
            self.cookie = cookie

    def _redirect_target(self, resp):
        """Return the Location header for a 301/302 response, else None."""
        if resp.status in self._REDIRECT_CODES:
            return resp.getheader("location")
        return None

    def _show_progress(self, label, sofar, total):
        """Redraw the one-line progress display on stdout."""
        if total > 0:
            perc = float(sofar) / float(total)
            count = int(perc * self._BAR_WIDTH)
            sys.stdout.write("\r%-30s|%-20s|%3d percent"
                             % (label, "#" * count, perc * 100))
        else:
            # No usable Content-Length: a percentage cannot be computed.
            sys.stdout.write("\r%-30s|%d bytes" % (label, sofar))
        sys.stdout.flush()

    def _save_body(self, resp, target, label, bufsize):
        """Stream the response body into the file *target* in chunks."""
        try:
            total = int(resp.getheader("content-length"))
        except (TypeError, ValueError):
            # Header missing (None) or not a number.
            total = 0
        sofar = 0
        with open(target, "wb") as out:
            while True:
                data = resp.read(bufsize)
                if len(data) == 0:
                    break
                out.write(data)
                sofar += len(data)
                self._show_progress(label, sofar, total)

    def download(self, url, localfolder):
        """Download *url* into *localfolder*, streaming it to disk.

        The file is named after the last component of the URL path
        (``index.html`` when that component is empty). A 301/302 response is
        followed and the redirect target is downloaded instead. Progress is
        written to stdout.

        Returns:
            The path of the file that was written.
        """
        bufsize = 1024
        conn, resp = self._open("GET", url)
        try:
            self._store_cookie(resp)
            location = self._redirect_target(resp)
            if location is None:
                fname = os.path.basename(urlparse(url)[2]) or "index.html"
                target = os.path.join(localfolder, fname)
                self._save_body(resp, target, fname, bufsize)
        finally:
            conn.close()
        if location is not None:
            return self.download(location, localfolder)
        return target

    def get(self, url):
        """GET *url*, following 301/302 redirects, and return the body."""
        conn, resp = self._open("GET", url)
        try:
            body = resp.read()
            self._store_cookie(resp)
            location = self._redirect_target(resp)
        finally:
            conn.close()
        if location is not None:
            return self.get(location)
        return body

    def post(self, url, data):
        """POST the form-encoded string *data* to *url* and return the body.

        On a 301/302 response the same *data* is posted again to the
        redirect target.
        """
        conn, resp = self._open("POST", url, data,
                                "application/x-www-form-urlencoded")
        try:
            # Keep the response in its own name so that *data* still holds
            # the caller's payload if it has to be re-sent on redirect.
            body = resp.read()
            self._store_cookie(resp)
            location = self._redirect_target(resp)
        finally:
            conn.close()
        if location is not None:
            return self.post(location, data)
        return body
Here is a snippet of code showing how to use it.
from pyscraper import PyScraper

# Fetch one page through a fresh scraper and print the returned body.
scraper = PyScraper()
page = scraper.get("http://www.yahoo.com/")
print(page)
By: gavi on: