Tuesday, September 15, 2015

Python Crawlers | Scraping Links from Websites using Python 2.7.9

Crawl a Website in Depth, Scrape the Links from Its HTML & Create a CSV File Containing Them #Python 2.7.9
# By Rami Jaloudi, Programmer
#Scraping links with Python 2.7.9

import urllib
from bs4 import BeautifulSoup
import urlparse
import mechanize

# Breadth-first crawl of a single site: fetch each queued page, collect its
# links, and append every newly discovered same-site URL to links.csv.
url = "http://........com"  # Enter your preferred URL
# Example: url = "http://nytimes.com"
br = mechanize.Browser()
# Host of the start page; only links on this host (or a subdomain) are followed.
start_host = urlparse.urlparse(url).hostname
urls = [url]      # FIFO queue of pages still to be fetched
visited = {url}   # every URL that has ever been queued (set => O(1) lookups)
while urls:
    # Dequeue exactly once per iteration, before anything can fail, so an
    # error on this page can never discard a different, unvisited URL.
    current = urls.pop(0)
    try:
        br.open(current)
        with open("links.csv", "a") as out:
            for link in br.links():
                parts = urlparse.urlparse(
                    urlparse.urljoin(link.base_url, link.url))
                host = parts.hostname
                if host is None:
                    # mailto:, javascript:, tel: ... have no host to crawl;
                    # skip them instead of aborting the rest of the page.
                    continue
                # Normalise to scheme + host + path, so the port, query
                # string and fragment never create duplicate entries.
                newurl = "http://" + host + parts.path

                # Compare hostnames rather than searching the whole URL, so
                # a foreign URL that merely mentions our host is rejected.
                same_site = start_host is not None and (
                    host == start_host or host.endswith("." + start_host))
                if same_site and newurl not in visited:
                    urls.append(newurl)
                    visited.add(newurl)
                    print(newurl)
                    out.write(newurl + "\n\n")
    except Exception:
        # Best effort: report the failure and move on to the next page.
        # Catching Exception (not a bare except) keeps Ctrl-C working.
        print("error")
  
# Leftover snippet wrapped in a bare triple-quoted string, so it is never executed.
# NOTE(review): `the_page` is not assigned anywhere in the visible script --
# confirm where it should come from before reviving this code.
'''
    print(the_page)
    SaveFile = open('the_page.txt', 'w')
    SaveFile.write(str(the_page))
    SaveFile.close()
 '''      

No comments:

Post a Comment