Python Crawlers: Depth & Scrape Website for HTML Code & Create csv file containing Links

Tuesday, September 15, 2015

Depth & Scrape Website for HTML Code & Create csv file containing Links

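# Depth & Scrape Website for HTML Code & Create csv file containing Links
# Scrape a website for its HTML code and create a file containing the links found
# (the script below writes them to a plain text file, one link per line, rather than CSV).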
# Using Python 2.7.9 - By Rami Jaloudi, Programmer
import urllib
import re
import os

def crawl(site, depth, linksfile):
    """Fetch *site*, record its outgoing links in *linksfile*, then crawl them.

    Every fetched page adds one section to *linksfile*: a ``[site]`` header
    line, one line per link found, and a terminating blank line.  The raw
    HTML of the page is also echoed to stdout.  Only links written literally
    as ``href="http://..."`` (double-quoted, absolute, plain http) are matched.

    Arguments:
    site      -- URL of the page to fetch.
    depth     -- depth of *site* in the crawl; nothing is done once it
                 reaches the module-level MAX_DEPTH.
    linksfile -- path of the text file the sections are appended to.

    The crawl is best effort: a page that cannot be fetched is reported on
    stdout and skipped, and the crawl continues with the remaining links.
    Visited URLs are not remembered, so a page linked from several places
    is fetched once per occurrence.
    """
    # Stop before touching the file or the network once the limit is reached.
    if depth >= MAX_DEPTH:
        return

    pattern = re.compile(r'href="(http://.*?)"')

    # Trailing comma: keep the cursor on this line so the status printed
    # below ends up next to the progress message (Python 2 print statement).
    print('crawling [%s]...' % site),

    # The file is closed again *before* recursing, so this page's section is
    # fully on disk ahead of its children's sections and at most one handle
    # is open at a time.
    with open(linksfile, 'a+') as f:
        f.write('[%s]\n' % site)
        try:
            url = urllib.urlopen(site)
            try:
                content = url.read()
            finally:
                url.close()
        except Exception as exc:
            # Best effort: report the failure, close this page's section and
            # move on.  KeyboardInterrupt / SystemExit are deliberately not
            # caught so the crawl can still be aborted.
            print('failed (%s)' % exc)
            f.write('\n')
            return
        print(content)
        hits = pattern.findall(content)
        for hit in hits:
            f.write('%s\n' % hit)
        print('done.')
        f.write('\n')

    for hit in hits:
        crawl(hit, depth + 1, linksfile)

# Depth limit read by crawl(): pages at depth 0 .. MAX_DEPTH - 1 are fetched.
MAX_DEPTH = 3
# Start page of the crawl and the file the discovered links are written to.
base = r'http://nytimes.com'
linksfile = r'links.txt'

# crawl() opens the links file in append mode, so remove the output of a
# previous run first to avoid mixing old and new results.
if os.path.isfile(linksfile):
    os.remove(linksfile)
crawl(base, 0, linksfile)

1 comment:

Robert — November 21, 2015 at 1:27 AM
Thanks for the blog loaded with so much information. Stopping by your blog helped me to get what I was looking for. responsive web development
ReplyDelete
Replies

Add comment