PyScraper: The Python Screen Scraper

I wrote a lightweight, Python-based screen scraper, which seems to be working great.

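Some of the features:

- HTTP GET and POST requests (form-encoded POST bodies)
- Remembers the cookie set by the server and sends it back on later requests
- Follows 301 and 302 redirects
- Downloads files to a local folder with a text progress bar
- Keeps a history of the URLs it has visited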

Here is the code for it.

import httplib,urllib,random,sys,re,os
from urlparse import urlparse

class PyScraper:
    """Small HTTP client for screen scraping.

    State kept between requests:

    * ``cookie``  -- the raw value of the most recent ``Set-Cookie`` response
      header; it is sent back verbatim as the ``Cookie`` header of every
      later request (no parsing of cookie attributes is done).
    * ``urlhist`` -- every URL requested so far, in order, including the
      targets of redirects that were followed.

    ``currenturl`` is initialised to an empty string and is not updated by
    any method of this class.

    NOTE: 301/302 redirects are followed recursively with no hop limit, and
    the ``Location`` value is used as-is, so a relative redirect target is
    not resolved against the request URL.
    """

    def __init__(self):
        """Start with no cookie, no current URL and an empty URL history."""
        self.cookie = ""
        self.currenturl = ""
        self.urlhist = []

    def __str__(self):
        """Return the visited URLs in order, each one followed by ``->``."""
        return "".join(item + "->" for item in self.urlhist)

    def _open(self, method, url, body=None, headers=None):
        """Record *url*, send a single request and return ``(conn, resp)``.

        The caller is responsible for reading ``resp`` and closing ``conn``.
        The stored cookie is updated from the response's ``Set-Cookie``
        header when one is present.
        """
        self.urlhist.append(url)
        scheme, hostname, path, params, query, fragment = urlparse(url)
        # "http://host" parses to an empty path; the request line needs "/".
        if path == "":
            path = "/"
        if query != "":
            path = path + "?" + query

        send_headers = {}
        if headers:
            send_headers.update(headers)
        # Only send a Cookie header once the server has actually set one.
        if self.cookie:
            send_headers["Cookie"] = self.cookie

        if scheme == "https":
            conn = httplib.HTTPSConnection(hostname)
        else:
            conn = httplib.HTTPConnection(hostname)
        try:
            conn.request(method, path, body, send_headers)
            resp = conn.getresponse()
        except Exception:
            # Do not leak the socket when the request itself fails.
            conn.close()
            raise

        new_cookie = resp.getheader("set-cookie")
        if new_cookie is not None:
            self.cookie = new_cookie
        return conn, resp

    def _redirect_target(self, resp):
        """Return the ``Location`` of a 301/302 response, otherwise ``None``.

        A redirect status without a ``Location`` header also yields ``None``,
        so callers treat it as a final response.
        """
        if resp.status in (301, 302):
            return resp.getheader("location")
        return None

    def _save(self, resp, target, label, bufsize):
        """Stream the body of *resp* into the file *target*.

        A progress bar labelled *label* is written to stdout when the
        response carries a positive ``Content-Length``; without one the
        percentage cannot be computed and no bar is shown.

        Returns the last chunk read, which is empty once the body has been
        consumed.
        """
        length = resp.getheader("content-length")
        total = 0
        if length is not None:
            total = int(length)

        sofar = 0
        f = open(target, "wb")
        try:
            while True:
                data = resp.read(bufsize)
                if len(data) == 0:
                    break
                f.write(data)
                sofar += len(data)
                if total > 0:
                    perc = float(sofar) / float(total)
                    count = int(perc * 20)
                    sys.stdout.write(
                        "\r%-30s|%-20s|%3d percent"
                        % (label, "#" * count, perc * 100))
                    sys.stdout.flush()
        finally:
            f.close()
        return data

    def download(self, url, localfolder):
        """Download *url* into the directory *localfolder*.

        The file name is the last component of the URL path, or
        ``index.html`` when the path ends in ``/`` or is empty.  A 301/302
        response is followed by downloading its ``Location`` instead, so the
        redirect stub is never written to disk and the file name comes from
        the URL that finally serves the content.

        Returns the final (empty) chunk read from the response; the payload
        itself is only written to disk.
        """
        bufsize = 1024
        fname = os.path.split(urlparse(url)[2])[1] or "index.html"

        conn, resp = self._open("GET", url)
        data = ""
        try:
            location = self._redirect_target(resp)
            if location is None:
                data = self._save(
                    resp, os.path.join(localfolder, fname), fname, bufsize)
        finally:
            conn.close()

        if location is not None:
            return self.download(location, localfolder)
        return data

    def get(self, url):
        """Issue a GET for *url* and return the response body.

        301/302 responses are followed with another GET to ``Location``.
        """
        conn, resp = self._open("GET", url)
        try:
            data = resp.read()
        finally:
            conn.close()

        location = self._redirect_target(resp)
        if location is not None:
            return self.get(location)
        return data

    def post(self, url, data):
        """POST the form-encoded string *data* to *url*; return the body.

        The query string of *url*, if any, is kept in the request target.
        301/302 responses are followed by re-posting the same *data* to
        ``Location``.
        """
        conn, resp = self._open(
            "POST", url, data,
            {"Content-Type": "application/x-www-form-urlencoded"})
        try:
            # Keep the response in its own name: *data* is still needed if
            # the request has to be repeated against a redirect target.
            body = resp.read()
        finally:
            conn.close()

        location = self._redirect_target(resp)
        if location is not None:
            return self.post(location, data)
        return body

Here is a short snippet showing how to use it.

# Example usage.  The import assumes the PyScraper class is saved in a
# module named pyscraper (e.g. pyscraper.py on the import path).
from pyscraper import PyScraper

# Create a scraper, fetch a page with get() and print what it returned.
p=PyScraper()
data=p.get("http://www.yahoo.com/")
print data
By: gavi on: