#note: <meta content='Story' property='bb:resource_type'>

import urllib2
import os
from bs4 import BeautifulSoup
from urlparse import urljoin

class Spider:

    links_to_crawl = []
    crawled_links = []
    ignored_links = ['/']
    domain = 'http://bloomberg.com/'
    #meta type = ('meta', {'property','bb:resource_type'})['content']=='Story'

    # append the starting link to links_to_crawl
    def __init__(self, url):
        print 'Spider initialising...'
        self.links_to_crawl.append(url)

    # open input url and return html
    def grab_html(self, url):
        open_url = urllib2.urlopen(url)
        data = open_url.read()
        open_url.close()
        return data
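    # possible variation (not in the original code): urllib2 also accepts a
    # Request object with custom headers and a timeout, e.g.
    #   request = urllib2.Request(url, headers={'User-Agent': 'python-spider'})
    #   open_url = urllib2.urlopen(request, timeout=10)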

    # return title from input html for file naming and ensure
    # no '/' present in title.
    def get_title(self, data=''):
        title_start = data.find('<title>')+7
        title_end = data.find('</title>')
        title = data[title_start:title_end]
        title = title.translate(None, '/')
        return title+".txt"
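    # e.g. for data containing '<title>Deal Reached 3/10</title>' this returns
    # 'Deal Reached 310.txt' (str.translate(None, '/') strips the slashes)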

    # return date from input html for file saving structure
    def get_date(self, data=''):
        soup = BeautifulSoup(data)
        # try statement to avoid error when meta tag combinations
        # not found.
        try:
            date = soup.find('meta', {'name':'pubdate'})['content']
            return date[:12] # !! only tested with bloomberg.com !!
        # if there is no published date, return 'Other'
        except TypeError:
            return 'Other'
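    # note: date[:12] keeps only the first 12 characters of the pubdate value;
    # the exact format of that value depends on the site (per the comment
    # above, this has only been checked against bloomberg.com)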

    # if link is relative url return 'Rel' or
    # if url is allowed domain return 'Abs', else False.
    def url_type(self, url=''):
        if url[0:4] != 'http':
            return 'Rel'
        elif url.find(self.domain) != -1:
            return 'Abs'
        else:
            return False
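    # e.g. url_type('/news/economy') -> 'Rel'
    #      url_type('http://bloomberg.com/news') -> 'Abs'
    #      url_type('http://example.com/') -> False
    # note: variants such as 'http://www.bloomberg.com/...' also return False,
    # because they do not contain the literal substring self.domain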

    # reconstruct relative url
    def reconstruct_url(self, page='', rel=''):
        print page #debug
        print rel #debug
        print urljoin(page, rel) #debug
        return urljoin(page, rel)
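    # e.g. urljoin('http://bloomberg.com/', '/news/articles/xyz')
    #      -> 'http://bloomberg.com/news/articles/xyz'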

    # get all links in input html and append to links_to_crawl
    # unless in crawled_links or ignored_links
    # if link is relative url reconstruct url and append to
    # links_to_crawl, append relative url to ignored_links
    def get_links(self, data=''):
        soup = BeautifulSoup(data)
        for link in soup.find_all('a'):
            # try statement to avoid error when finding
            # <a> tags without 'href'
            try:
                if link['href'] in self.ignored_links or link['href'] in self.crawled_links:
                    pass
                else:
                    if self.url_type(link['href']) == 'Rel':
                        reconstructed_link = self.reconstruct_url(self.domain, link['href']) #to change !!!!!!!!!!!!!!!!!
                        self.links_to_crawl.append(reconstructed_link) # append reconstructed link to links_to_crawl
                        self.ignored_links.append(link['href']) # append original link to ignored_links
                    else:
                        # absolute urls (including off-domain ones, for which
                        # url_type returns False) are queued as-is
                        self.links_to_crawl.append(link['href'])
            except KeyError:
                pass

    # if directory exists do nothing
    # if directory does not exist write directory
    def ensure_dir(self, directory=''):
        if not os.path.exists(directory):
            os.makedirs(directory)
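    # note: os.makedirs also creates any missing intermediate directories in
    # the path, so the date sub-directory and its parent are created together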

    # ensure the html being saved is the type requested
    # currently only compatible with 1 meta type
    def ensure_meta_type(self, data=''):
        soup = BeautifulSoup(data)
        try:
            # only treat the page as saveable when the bb:resource_type meta
            # tag is present and marked as 'Story'
            if soup.find('meta', {'property':'bb:resource_type'})['content'] == 'Story':
                print 'True'
                return True
            print 'False'
            return False
        except TypeError:
            print 'False'
            return False
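    # e.g. a page whose head contains the tag from the note at the top of this
    # file, <meta content='Story' property='bb:resource_type'>, returns True;
    # a page without that tag raises TypeError inside the try and returns False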

    # save input html to txt file on mac os desktop and return
    # absolute path to file
    def save_html(self, data=''):
        if self.ensure_meta_type(data):
            print 'SAVING URL'
            # allocate save path for file and ensure save path exists
            save_path = os.path.abspath('/Users/sampeka/Desktop/Python Spider'+'/'+self.get_date(data))
            self.ensure_dir(save_path)
            # get file name and write file to absolute path
            file_name = self.get_title(data)
            absolute_path = save_path+'/'+file_name
            opened_file = open(absolute_path, 'w')
            opened_file.write(data)
            opened_file.close()
            return absolute_path
        else:
            pass
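    # possible variation (not in the original code): the absolute path could
    # also be built with os.path.join(save_path, file_name) instead of
    # concatenating strings with '/'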

    # crawl links_to_crawl and pop to crawled_links list
    # if ValueError then pop to ignored_links
    # except urllib2.URLError to avoid web crawler crawling
    # non-url links
    def crawl_links(self):
        while len(self.links_to_crawl) > 0:
            url = self.links_to_crawl[0]
            print url
            try:
                data = self.grab_html(url)
                self.get_links(data)
                self.save_html(data)
                self.crawled_links.append(self.links_to_crawl.pop(0))
            except (ValueError, urllib2.URLError):
                self.ignored_links.append(self.links_to_crawl.pop(0))
        print 'Spider finished.'
        print 'Ignored links:'
        print self.ignored_links
        print 'Crawled links:'
        print self.crawled_links

spider = Spider('http://www.bloomberg.com/news')
spider.crawl_links()
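# note: crawl_links() keeps running until links_to_crawl is empty, so on a
# large site the crawl is effectively open-ended; interrupt it manually
# (e.g. Ctrl-C) once enough pages have been saved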