Python Crawlers: 2015

Tuesday, September 15, 2015

Depth & Scrape Website for HTML Code & Create csv file containing Links

Depth & Scrape Website for HTML Code & Create csv file containing Links
#Scrape Website for HTML Code and Create csv file containing Links
# Using Python 2.7.9 - By Rami Jaloudi, Programmer
import urllib
import re
import os

def crawl(site, depth, linksfile):
    """Recursively record the http:// links found on a page.

    Fetches ``site``, echoes its HTML to stdout, appends a ``[site]``
    header followed by every ``href="http://..."`` target to
    ``linksfile``, and then crawls each of those targets one level deeper.
    Nothing is done once ``depth`` reaches the module-level ``MAX_DEPTH``.

    Written for Python 2.7 (uses ``urllib.urlopen``).

    Args:
        site: URL of the page to fetch.
        depth: Current recursion depth (0 for the starting page).
        linksfile: Path of the text file the links are appended to.

    Returns:
        None. A page that cannot be fetched is reported on stderr and
        skipped; its ``[site]`` header is still written.
    """
    import sys  # local import: only this function needs it

    if depth >= MAX_DEPTH:
        return

    pattern = re.compile(r'href="(http://.*?)"')
    sys.stdout.write('crawling [%s]... ' % site)

    # The links file is closed before recursing so that this page's entries
    # are flushed ahead of its children's and only one handle is open at a
    # time.
    with open(linksfile, 'a+') as f:
        f.write('[%s]\n' % site)
        try:
            url = urllib.urlopen(site)
            try:
                content = url.read()
            finally:
                url.close()
        except Exception as exc:
            # Best-effort crawl: a broken link must not stop the run, but
            # the failure is reported instead of silently discarded.
            # KeyboardInterrupt is not an Exception, so Ctrl-C still works.
            sys.stdout.write('\n')
            sys.stderr.write('failed to crawl [%s]: %s\n' % (site, exc))
            return
        print(content)
        hits = pattern.findall(content)
        for hit in hits:
            f.write(hit + '\n')
        print('done.')
        f.write('\n')  # blank line separates one page's links from the next

    for hit in hits:
        crawl(hit, depth + 1, linksfile)

# Crawl settings. MAX_DEPTH is read by crawl() as a module-level global.
MAX_DEPTH = 3
base = r'http://nytimes.com'
linksfile = r'links.txt'

# Start from a clean links file: crawl() opens it in append mode, so the
# output of an earlier run would otherwise be kept.
if os.path.isfile(linksfile):
    os.remove(linksfile)
crawl(base, 0, linksfile)

Python Crawlers | Scraping Links from Websites using Python 2.7.9

Depth & Scrape Website for HTML Code & Create csv file containing Links
#Python 2.7.9
# By Rami Jaloudi, Programmer
#Scraping links with Python 2.7.9

import urllib
from bs4 import BeautifulSoup
import urlparse
import mechanize

# Breadth-first crawl of a single site using mechanize (Python 2.7).
# Every newly discovered same-site URL is printed and appended to links.csv.
url = "http://........com"  # Enter your preferred URL
#url = "http://nytimes.com"
br = mechanize.Browser()
# Host the crawl is confined to; its subdomains are accepted as well.
base_host = urlparse.urlparse(url).hostname
urls = [url]       # pages still to fetch, oldest first
visited = {url}    # every URL ever queued; a set gives O(1) membership tests
while urls:
    # Take the page off the queue exactly once, before anything can fail,
    # so an error on this page never discards a different, unvisited URL.
    current = urls.pop(0)
    try:
        br.open(current)
        with open("links.csv", "a") as out:
            for link in br.links():
                parts = urlparse.urlparse(
                    urlparse.urljoin(link.base_url, link.url))
                host = parts.hostname
                # mailto:, javascript: and similar links have no host.
                if host is None or base_host is None:
                    continue
                # Normalise to scheme + host + path (query/fragment dropped).
                newurl = "http://" + host + parts.path
                on_site = host == base_host or host.endswith("." + base_host)
                if on_site and newurl not in visited:
                    urls.append(newurl)
                    visited.add(newurl)
                    print(newurl)
                    out.write(newurl + "\n\n")
    except Exception:
        # Best-effort crawl: report the failed page and move on.
        print("error")

'''
    print(the_page)
    SaveFile = open('the_page.txt', 'w')
    SaveFile.write(str(the_page))
    SaveFile.close()
'''

Thursday, June 11, 2015

Web Crawling - Using regex module (Python)

Depth & Scrape Website for HTML Code & Create csv file containing Links
# This script uses the regex and urllib modules to crawl a list of sites for title tags.
# It simply provides the title tags, i.e., titles you would see on web browser tabs.
# By Rami Jaloudi, Programmer
import re, urllib
try:
    import urllib.request # this is if you are running on Python 3
except:
    pass

# Site names; each is fetched as http://<name>.com
sites = 'cnn nytimes bloomberg'.split()

# re.S lets '.' span newlines so a title broken across lines still matches.
pat = re.compile(r'<title>.+?</title>', re.I | re.S)

# Pick the fetch function once: urllib.urlopen exists on Python 2 only,
# urllib.request.urlopen on Python 3. Choosing up front means a network
# error is never mistaken for "wrong Python version".
urlopen = getattr(urllib, 'urlopen', None) or urllib.request.urlopen

for s in sites:
    print('Searching:' + s)
    try:
        u = urlopen('http://' + s + '.com')
        try:
            text = u.read()
        finally:
            u.close()
    except IOError as exc:
        # Both urlopen variants report network failures as IOError/OSError.
        print('Could not fetch %s: %s' % (s, exc))
        continue
    # Python 3 returns bytes; decode so the pattern sees real text rather
    # than the "b'...'" repr that str() would produce.
    if not isinstance(text, str):
        text = text.decode('utf-8', 'replace')
    title = pat.findall(text)
    print(title)
    print('\n')

Web Crawling - Using urllib module (Python)

Depth & Scrape Website for HTML Code & Create csv file containing Links
import urllib

# Web Crawling - Using urllib module (Python)
# By Rami Jaloudi, Programmer
# This script runs on Python 2.7
# If using Python 3, please note that the urllib module has been split into parts and renamed as follows:
# urllib.request, urllib.parse, and urllib.error

# Fetch a single page and print its raw HTML (Python 2.7: urllib.urlopen).
url = "http://www.________.com" # Need to enter URL

# Despite its name, `request` is the file-like response object that
# urlopen() returns.
request = urllib.urlopen(url)
try:
    response = request.read()
finally:
    # Release the connection even if reading fails.
    request.close()

print(response)