PyScraper: The Python Screen Scraper

I wrote a lightweight, Python-based screen scraper, which seems to be working well.

Some of the features:

Session management using cookies
Separate functions for GET, POST, and downloading large files
Automatic handling of redirects

Here is the code for it.

import httplib,urllib,random,sys,re,os
from urlparse import urlparse

class PyScraper:
    """Minimal HTTP screen scraper that keeps a single cookie between requests.

    The value of the most recent ``Set-Cookie`` response header is stored
    verbatim in ``self.cookie`` and sent back as the ``Cookie`` header of
    every later request.  Responses with status 301 or 302 are followed
    automatically.  Every URL requested (including redirect targets) is
    appended to ``self.urlhist``.

    NOTE: connections are always made with ``httplib.HTTPConnection``; the
    URL scheme is parsed but not used, so ``https`` URLs are not fetched
    over TLS.  Relies on the module-level names ``httplib``, ``urlparse``,
    ``os`` and ``sys``.
    """

    def __init__(self):
        # Raw value of the last Set-Cookie header seen ("" until one arrives).
        self.cookie = ""
        # Initialised here but not updated by any method of this class.
        self.currenturl = ""
        # URLs in the order they were requested.
        self.urlhist = []

    def __str__(self):
        """Return the URL history as ``url1->url2->`` (trailing arrow kept)."""
        return "".join(item + "->" for item in self.urlhist)

    @staticmethod
    def _split_url(url):
        """Return ``(hostname, path, target)`` for ``url``.

        ``path`` is the bare URL path and ``target`` is the path plus the
        query string, i.e. what goes on the HTTP request line.  An empty
        path becomes ``/`` so the request line is always valid.
        """
        scheme, hostname, path, params, query, fragment = urlparse(url)
        if not path:
            path = "/"
        target = path
        if query != "":
            target = path + "?" + query
        return hostname, path, target

    def _store_cookie(self, resp):
        """Remember the response's Set-Cookie header, if it has one."""
        cookie = resp.getheader("set-cookie")
        if cookie is not None:
            self.cookie = cookie

    @staticmethod
    def _redirect_target(resp):
        """Return the Location of a 301/302 response, else ``None``.

        A redirect status without a Location header also yields ``None``
        so the caller treats the response as final instead of crashing.
        """
        if resp.status in (301, 302):
            return resp.getheader("location")
        return None

    @staticmethod
    def _save_body(resp, localfolder, fname, bufsize):
        """Stream ``resp`` into ``localfolder/fname`` and draw a progress bar.

        Returns the last chunk read, which is always empty.
        """
        length = resp.getheader("content-length")
        # A missing or zero Content-Length leaves the bar empty rather than
        # raising TypeError / ZeroDivisionError.
        total = int(length) if length else 0
        sofar = 0
        with open(localfolder + "/" + fname, "wb") as f:
            while True:
                data = resp.read(bufsize)
                if not data:
                    break
                f.write(data)
                sofar += len(data)
                perc = float(sofar) / float(total) if total else 0.0
                count = int(perc * 20)
                sys.stdout.write("\r%-30s|%-20s|%3d percent"
                                 % (fname, "#" * count, perc * 100))
                sys.stdout.flush()
        return data

    def download(self, url, localfolder):
        """Download ``url`` into ``localfolder`` in 1024-byte chunks.

        The local file name is the last component of the URL path.  A
        301/302 response is followed *before* anything is written, and the
        redirect target is downloaded instead.

        Returns the final chunk read from the response, which is empty;
        this mirrors the original return value.
        """
        bufsize = 1024
        self.urlhist.append(url)
        hostname, path, target = self._split_url(url)
        fname = os.path.split(path)[1]

        data = None
        conn = httplib.HTTPConnection(hostname)
        try:
            conn.request("GET", target, None, {"Cookie": self.cookie})
            resp = conn.getresponse()
            self._store_cookie(resp)
            location = self._redirect_target(resp)
            if location is None:
                data = self._save_body(resp, localfolder, fname, bufsize)
        finally:
            # Close the connection even if the request or file write fails.
            conn.close()
        if location is not None:
            return self.download(location, localfolder)
        return data

    def get(self, url):
        """GET ``url`` and return the response body, following 301/302."""
        self.urlhist.append(url)
        hostname, _path, target = self._split_url(url)

        conn = httplib.HTTPConnection(hostname)
        try:
            conn.request("GET", target, None, {"Cookie": self.cookie})
            resp = conn.getresponse()
            body = resp.read()
            self._store_cookie(resp)
            location = self._redirect_target(resp)
        finally:
            conn.close()
        if location is not None:
            return self.get(location)
        return body

    def post(self, url, data):
        """POST url-encoded ``data`` to ``url`` and return the response body.

        On a 301/302 response the same ``data`` is posted again to the
        redirect target.
        """
        self.urlhist.append(url)
        hostname, _path, target = self._split_url(url)

        conn = httplib.HTTPConnection(hostname)
        try:
            conn.request(
                "POST", target, data,
                {"Content-Type": "application/x-www-form-urlencoded",
                 "Cookie": self.cookie})
            resp = conn.getresponse()
            # Kept in its own name so the caller's payload is still intact
            # if the request has to be repeated for a redirect.
            body = resp.read()
            self._store_cookie(resp)
            location = self._redirect_target(resp)
        finally:
            conn.close()
        if location is not None:
            return self.post(location, data)
        return body

Here is a short snippet showing how to use it.

from pyscraper import PyScraper

# Create a scraper, fetch one page with a GET request, and print the
# response body that comes back.
scraper = PyScraper()
page = scraper.get("http://www.yahoo.com/")
print(page)