Here is a rough Huffington Post scraper — sorry the contribution is crude. It's Python. I left off some of the dicts and lists, but their contents are obvious from context. It finds keywords too, though I left that stop-word list off as well.
from urllib import urlopen
import string
import re
##for left column of homepage
def h_p_left(url, count=3):
    """Scrape the first ``count`` left-column stories from a HuffPost page.

    Note: despite its name, ``url`` is the page's HTML *source* (callers
    pass ``urlopen(...).read()``), not a URL.

    Side effects: for each story, kill_links()/kill_tittles() append the
    story's link and title to the module-level ``links``/``tittles``
    lists, get_text() appends the cleaned article body to ``content``,
    and the result tuple is stored in the module-level ``news_dict``.

    Returns the (mutated) ``news_dict``.
    """
    target_section = re.compile('<h3>(.*)</a></h3>')  ##link + title markup
    sections = target_section.findall(url)
    for i in range(count):  ##``count`` generalizes the original hard-coded 3
        kill_links(sections[i])    ##appends this story's URL to links
        kill_tittles(sections[i])  ##appends this story's title to tittles
        get_text(links[i])         ##fetches article, appends text to content
        ##value holds the *shared* keywords list, this story's link and text
        news_dict[(tittles[i])] = (keywords, links[i], content[i])
    return news_dict
##same as above but the HTML is a bit different.
def h_p_center(url, count=3):
    """Scrape the first ``count`` center-column stories from a HuffPost page.

    Identical to h_p_left() except the center column wraps stories in
    <h4> instead of <h3>. ``url`` is the page's HTML source, and results
    accumulate in the module-level ``links``/``tittles``/``content``/
    ``news_dict`` globals via the same helpers.

    Returns ``news_dict`` (added for consistency with h_p_left(); the
    original returned None and callers ignore the return value).
    """
    target_section = re.compile('<h4>(.*)</a></h4>')
    sections = target_section.findall(url)
    for i in range(count):  ##``count`` generalizes the original hard-coded 3
        kill_links(sections[i])
        kill_tittles(sections[i])
        get_text(links[i])
        news_dict[(tittles[i])] = (keywords, links[i], content[i])
    return news_dict
##gets link sans HTML
def kill_links(text_in):
    """Pull every href URL out of an HTML fragment.

    Appends the matches to the module-level ``links`` list and returns
    that same list.
    """
    href_pattern = re.compile('<a href="(.*?)".*>')
    for url in href_pattern.findall(text_in):
        links.append(url)
    return links
##gets title sans HTML
def kill_tittles(text_in):
    """Pull the title text (everything after a ``>``) out of an HTML fragment.

    Appends the matches to the module-level ``tittles`` list and returns
    that same list.

    Bug fix: the original compiled the pattern as ``stripped_title`` but
    then called ``stripped_tittle.findall`` (a typo), so every call
    raised NameError.
    """
    stripped_title = re.compile('>(.*)')
    tittles.extend(stripped_title.findall(text_in))
    return tittles
##scrapes content
def get_text(url):
    """Fetch an article page, clean its body text, and count keywords.

    Appends the cleaned text to the module-level ``content`` list and
    (word, count) pairs for words used more than 3 times — excluding the
    module-level ``bad_words`` stop list, defined elsewhere — to the
    module-level ``keywords`` list.

    Returns the (mutated) ``content`` and ``keywords`` lists.
    """
    articlepage = urlopen(url).read()  ##go to link URL
    ##locate the start of the body text (two known HuffPost layouts)
    front = articlepage.find('<div class="articleBody" itemprop="articleBody">')
    if front == -1:
        front = articlepage.find('<div class="entry_body_text">')
    ##NOTE(review): '<>' almost certainly never occurs, so find() returns
    ##-1 and the slice runs to the page's second-to-last character; kept
    ##as-is to preserve behavior -- confirm the intended end marker.
    back = articlepage.find('<>', front)
    text = articlepage[front:back]
    ##strip HTML tags in one pass. Bug fix: the original re-ran re.sub()
    ##with each matched tag as a *pattern*, which breaks on tags that
    ##contain regex metacharacters (e.g. '?' in a query-string href) and
    ##was quadratic; pattern.sub() is correct and single-pass.
    kill_tags = re.compile(r'<.*?>')
    text = kill_tags.sub('', text)
    ##Bug fix: string.maketrans requires equal-length arguments; the
    ##original mapped 3 chars onto 1 and raised ValueError on every call.
    text = text.translate(string.maketrans("\n\t\r", "   "))
    text = text.strip()
    text = ' '.join(text.split())  ##collapse runs of whitespace
    content.append(text)  ##adds it to content
    ##crude keyword extraction: count words, drop stop words, keep words
    ##appearing more than 3 times.
    prekey = {}
    key = {}
    text = text.lower()
    for word in text.split(' '):
        if word in prekey:
            prekey[word] += 1
        else:
            prekey[word] = 1
    for word in prekey.keys():  ##Py2: .keys() is a list, safe to delete while looping
        if word in bad_words:
            del prekey[word]
    for word in prekey:
        if prekey[word] > 3:
            key[word] = prekey[word]
    ##(the original re-filtered ``key`` against bad_words here; redundant
    ##since ``prekey`` was already purged, so that loop is dropped)
    for word in key:
        keywords.append((word, key[word]))
    return content, keywords
##the rest is simple, and repetitive
links=[]
tittles=[]
content=[]
keywords=[]
pwebpage=urlopen('http://www.huffingtonpost.com/politics').read()
h_p_left(pwebpage)
links=[]
content=[]
tittles=[]
keywords= []
news_dict={}
politics_dict={}
world_dict={}
h_p_center(pwebpage)
politics_dict=news_dict
news_dict={}
links=[]
tittles=[]
content=[]
keywords=[]
print "HP - p ; done"
bwebpage=urlopen('http://www.huffingtonpost.com/business').read()
h_p_left(bwebpage)
links=[]
tittles=[]
content=[]
keywords=[]
h_p_center(bwebpage)
business_dict=news_dict
news_dict={}
print ("HP - b; done")
wwebpage=urlopen('http://www.huffingtonpost.com/world').read()
links=[]
tittles=[]
content=[]
keywords=[]
h_p_left(wwebpage)
links=[]
tittles=[]
content=[]
keywords=[]
h_p_center(wwebpage)
world_dict=news_dict
print ("HP - w;done")
news_dict={}
Edited because I originally copy-pasted the code incorrectly.