Python como wget
Thursday, February 12th, 2009 — Inspirado por este e-mail, resolvi tentar escrever um `wget -r` em Python. Com a ajuda do BeautifulSoup, bastou meia hora para chegar numa prova de conceito bem interessante…
#!/usr/bin/env python """ Proof-of-concept Python implementation for `wget -r`. Downloads only what looks like files. That is: it isn't really recursive (yet).""" from urllib import urlopen, urlretrieve from urlparse import urlsplit import sys import os from BeautifulSoup import BeautifulSoup URL = 'http://humberto.digi.com.br' print "Opening", URL, "..." b = BeautifulSoup(urlopen(URL).read()) links = [a['href'] for a in b.findAll('a')] internal = [l for l in links if l.startswith(URL)] # urlsplit: # # (u'http', # u'humberto.digi.com.br', # u'/wp-content/uploads/2008/03/delimport.png', # '', # '') site_name = urlsplit(URL)[1] if not os.path.isdir(site_name): os.mkdir(site_name) os.chdir(site_name) def reporthook(*a): sys.stdout.write('.') for url in internal: # Gets only the file path, stripping first '/' path = urlsplit(url)[2][1:] print path, if not path or path.endswith('/'): # Looks like a directory, skip print 'skip' continue # Replicate directory structure dirname, fname = os.path.split(path) if dirname and not os.path.isdir(dirname): os.makedirs(dirname) urlretrieve(url, path, reporthook) print "OK"
Esse artigo é dedicado ao FTP Offline [2001-2008] da Diginet, feito em PHP com wget.
blog










