Python Crawlers: September 2015

Tuesday, September 15, 2015

Depth & Scrape Website for HTML Code & Create csv file containing Links

Depth & Scrape Website for HTML Code & Create csv file containing Links

# Scrape Website for HTML Code and Create csv file containing Links
# Using Python 2.7.9 - By Rami Jaloudi, Programmer
import urllib
import re
import os

def crawl(site, depth, linksfile):
    """Recursively record the absolute ``http://`` links reachable from *site*.

    Appends a ``[site]`` header to ``linksfile``, fetches the page, prints its
    HTML, appends every ``href="http://..."`` target followed by a blank line,
    and then crawls each of those targets one level deeper. Nothing is done
    once ``depth`` reaches the module-level ``MAX_DEPTH``.

    Written for Python 2 (``urllib.urlopen``); the single-argument
    ``print(...)`` form is used because it also parses as a Python 2 print
    statement.

    Args:
        site: URL of the page to fetch.
        depth: Current recursion level; the initial call passes 0.
        linksfile: Path of the text file that links are appended to.

    Returns:
        None. A page that cannot be fetched or parsed is reported on stdout
        and skipped, so one bad link does not abort the whole crawl.
    """
    if depth >= MAX_DEPTH:
        return

    pattern = re.compile(r'href="(http://.*?)"')

    # The file is closed before recursing: holding it open while child calls
    # append to the same path lets the buffered parent section be flushed
    # after the children's, scrambling the order of the output.
    with open(linksfile, 'a+') as f:
        print('crawling [%s]...' % site)
        f.write('[%s]\n' % site)
        try:
            url = urllib.urlopen(site)
            try:
                content = url.read()
            finally:
                url.close()
            print(content)
            hits = pattern.findall(content)
        except Exception as exc:
            # Best-effort crawl: skip this page, but unlike a bare ``except``
            # this still lets KeyboardInterrupt / SystemExit stop the program.
            print('failed [%s]: %s' % (site, exc))
            return
        for hit in hits:
            f.write(hit + '\n')
        print('done.')
        f.write('\n')

    for hit in hits:
        crawl(hit, depth + 1, linksfile)

# crawl() only fetches a page while its depth is below this limit.
MAX_DEPTH = 3
base = r'http://nytimes.com'
linksfile = r'links.txt'

# crawl() appends to the links file, so remove any output left by an earlier run.
if os.path.isfile(linksfile):
    os.remove(linksfile)
crawl(base, 0, linksfile)

Python Crawlers | Scraping Links from Websites using Python 2.7.9

Depth & Scrape Website for HTML Code & Create csv file containing Links

# Python 2.7.9
# By Rami Jaloudi, Programmer
#Scraping links with Python 2.7.9

import urllib
from bs4 import BeautifulSoup
import urlparse
import mechanize

url = "http://........com" # Enter your preferred URL
#url = "http://nytimes.com"
br = mechanize.Browser()

# Breadth-first crawl: `urls` is the queue of pages still to open and
# `visited` holds every URL that has ever been queued, so none is queued twice.
base_host = urlparse.urlparse(url).hostname
urls = [url]
visited = set(urls)
while urls:
    # Dequeue before the try block so that a failure part-way through a page
    # can never remove a second, not-yet-opened URL from the queue.
    current = urls.pop(0)
    try:
        br.open(current)
        with open("links.csv", "a") as out:
            for link in br.links():
                parts = urlparse.urlparse(
                    urlparse.urljoin(link.base_url, link.url))
                host = parts.hostname
                if not host:
                    # mailto:, javascript: and similar links have no host;
                    # skip them instead of aborting the rest of the page.
                    continue
                # Normalise to scheme + host + path (query/fragment dropped).
                newurl = "http://" + host + parts.path

                # Compare hosts rather than searching the whole URL, so a
                # foreign URL that merely mentions the host in its path is
                # not followed; subdomains of the start host still are.
                same_site = (host == base_host
                             or host.endswith("." + base_host))
                if same_site and newurl not in visited:
                    urls.append(newurl)
                    visited.add(newurl)
                    print(newurl)
                    out.write(newurl + "\n\n")

    except Exception:
        # Best-effort: report and move on to the next queued URL, while still
        # letting KeyboardInterrupt stop the crawl.
        print("error")

# Leftover snippet for dumping a page to the_page.txt, disabled by wrapping it
# in a string literal so it is never executed.
# NOTE(review): `the_page` is not defined anywhere in the visible script --
# confirm where it should come from before re-enabling this.
'''
    print(the_page)
    SaveFile = open('the_page.txt', 'w')
    SaveFile.write(str(the_page))
    SaveFile.close()
'''