index.html
<!DOCTYPE html>
<html>
<head>
<!-- Static page whose body contains a single embedded Tableau Public visualization. -->
<meta charset="utf-8">
<meta name="viewport" content="width=device-width">
<title>GoogleNews - 25/10/2018</title>
</head>
<body>
<!--
  Tableau Public embed for the view named 'GoogleNews/GoogleNews'.
  * The <noscript> image is the static fallback shown when JavaScript is off.
  * The <object> starts hidden (display:none) and only carries the <param> settings.
  * The inline script sets the <object> width to 100% and its height to
    0.75 x the placeholder div's width (a 4:3 box), then inserts a <script>
    tag for viz_v1.js immediately before the <object>.
  The height is computed once when the script runs; nothing here recomputes it on resize.
-->
<div class='tableauPlaceholder' id='viz1540498161350' style='position: relative'><noscript><a href='#'><img alt=' ' src='https://public.tableau.com/static/images/Go/GoogleNews/GoogleNews/1_rss.png' style='border: none' /></a></noscript><object class='tableauViz' style='display:none;'><param name='host_url' value='https%3A%2F%2Fpublic.tableau.com%2F' /> <param name='embed_code_version' value='3' /> <param name='site_root' value='' /><param name='name' value='GoogleNews/GoogleNews' /><param name='tabs' value='no' /><param name='toolbar' value='yes' /><param name='static_image' value='https://public.tableau.com/static/images/Go/GoogleNews/GoogleNews/1.png' /> <param name='animate_transition' value='yes' /><param name='display_static_image' value='yes' /><param name='display_spinner' value='yes' /><param name='display_overlay' value='yes' /><param name='display_count' value='yes' /></object></div> <script type='text/javascript'> var divElement = document.getElementById('viz1540498161350'); var vizElement = divElement.getElementsByTagName('object')[0]; vizElement.style.width='100%';vizElement.style.height=(divElement.offsetWidth*0.75)+'px'; var scriptElement = document.createElement('script'); scriptElement.src = 'https://public.tableau.com/javascripts/api/viz_v1.js'; vizElement.parentNode.insertBefore(scriptElement, vizElement); </script>
</body>
</html>
googlenews_ar.py
import feedparser
import pandas as pd
# Scrape the Google News RSS feeds (es_ar edition) for a fixed set of topics
# and write one row per news entry to 'googlenews_ar.tsv', sorted by the
# number of articles announced for each entry.

# Topic code sent in the feed URL ('letra') and the category label written
# to the output for entries of that feed ('categoria').
lista = [
    {"letra": "n", "categoria": "Nacional"},
    {"letra": "s", "categoria": "Deportes"},
    {"letra": "m", "categoria": "Salud"},
    {"letra": "t", "categoria": "Ciencia y Tecnología"},
    {"letra": "e", "categoria": "Entretenimiento"},
    {"letra": "b", "categoria": "Economía"},
]


def _split_title(raw_title):
    """Split a feed title into ``(headline, source)``.

    The source is the text after the LAST ' - ' separator; everything
    before it is the headline, so a headline that itself contains ' - '
    is no longer truncated at its first separator. When the title has no
    separator at all, both fields carry the whole title.
    """
    pieces = raw_title.rsplit(' - ', 1)
    return pieces[0], pieces[-1]


def _article_url(link):
    """Return the segment of *link* that follows '&url='.

    Links without the '&url=' marker are returned unchanged instead of
    raising IndexError and aborting the whole run.
    """
    pieces = link.split('&url=')
    if len(pieces) > 1:
        return pieces[1]
    return link


def _cluster_size(summary):
    """Return the article count announced in an entry's HTML *summary*.

    The summary is cut at every <b>/</b> tag; in a fragment containing
    'artículos informativos »' the second space-separated word is read as
    the count, with '.' thousands separators removed. The last fragment
    that yields a valid integer wins. Returns 1 when no count is found.
    """
    size = 1
    for fragment in summary.replace("</b>", "<b>").split('<b>'):
        if u"artículos informativos »" not in fragment:
            continue
        # The marker itself contains spaces, so index 1 always exists.
        candidate = fragment.split(' ')[1].replace(".", "")
        try:
            size = int(candidate)
        except ValueError:
            # Fragment is not shaped like '<word> <number> artículos ...';
            # keep the current value rather than crash the scrape.
            pass
    return size


def _entry_row(entry, categoria):
    """Build one output row from a parsed feed *entry*.

    Returns ``[title, source, date, url, category, size]``, matching the
    column order of the DataFrame built below.
    """
    titulo, medio = _split_title(entry.title)
    return [
        titulo,
        medio,
        entry.published,
        _article_url(entry.link),
        categoria,
        _cluster_size(entry.summary),
    ]


data = []
for topic in lista:
    feed_url = ('https://news.google.com/news?pz=1&cf=all&ned=es_ar&hl=es&topic='
                + topic['letra'] + '&output=rss')
    feed = feedparser.parse(feed_url)
    for entry in feed.entries:
        # Entries equal to the first entry of the feed are left out
        # (in practice: the first entry itself). An empty feed never
        # reaches this line, so entries[0] is always valid here.
        if entry == feed.entries[0]:
            continue
        data.append(_entry_row(entry, topic['categoria']))

df = pd.DataFrame(data, columns=['Title', 'Source', 'Date', 'url', 'Category', 'Size'])
# Largest clusters first.
df = df.sort_values(by='Size', ascending=False)
df.to_csv('googlenews_ar.tsv', sep='\t', header=True, index=False, encoding='utf-8')