block by aaizemberg 616552f12dce4c91e22fb6417af4e741

Las noticias del día 25/10/2018 (GoogleNews & Tableau)

Full Screen

index.html

<!DOCTYPE html>
<html>
<head>
  <!-- Minimal page whose only content is an embedded Tableau Public visualization ("GoogleNews"). -->
  <meta charset="utf-8">
  <meta name="viewport" content="width=device-width">
  <title>GoogleNews - 25/10/2018</title>
</head>
<body>
  <!--
    Tableau embed (kept on one line exactly as generated):
      * The placeholder <div> holds a <noscript> static image and a hidden
        <object class='tableauViz'> whose <param> elements name the workbook/view
        (GoogleNews/GoogleNews) and the display options.
      * The inline script looks up the placeholder by id, sets the <object> to
        100% width and a height of 0.75 x the placeholder's width (4:3 ratio),
        then creates a <script> element pointing at Tableau's viz_v1.js and
        inserts it immediately before the <object>.
  -->
  <div class='tableauPlaceholder' id='viz1540498161350' style='position: relative'><noscript><a href='#'><img alt=' ' src='https:&#47;&#47;public.tableau.com&#47;static&#47;images&#47;Go&#47;GoogleNews&#47;GoogleNews&#47;1_rss.png' style='border: none' /></a></noscript><object class='tableauViz'  style='display:none;'><param name='host_url' value='https%3A%2F%2Fpublic.tableau.com%2F' /> <param name='embed_code_version' value='3' /> <param name='site_root' value='' /><param name='name' value='GoogleNews&#47;GoogleNews' /><param name='tabs' value='no' /><param name='toolbar' value='yes' /><param name='static_image' value='https:&#47;&#47;public.tableau.com&#47;static&#47;images&#47;Go&#47;GoogleNews&#47;GoogleNews&#47;1.png' /> <param name='animate_transition' value='yes' /><param name='display_static_image' value='yes' /><param name='display_spinner' value='yes' /><param name='display_overlay' value='yes' /><param name='display_count' value='yes' /></object></div>                <script type='text/javascript'>                    var divElement = document.getElementById('viz1540498161350');                    var vizElement = divElement.getElementsByTagName('object')[0];                    vizElement.style.width='100%';vizElement.style.height=(divElement.offsetWidth*0.75)+'px';                    var scriptElement = document.createElement('script');                    scriptElement.src = 'https://public.tableau.com/javascripts/api/viz_v1.js';                    vizElement.parentNode.insertBefore(scriptElement, vizElement);                </script>  
</body>
</html>

googlenews_ar.py

# -*- coding: utf-8 -*-

import feedparser
import pandas as pd

# Download the Google News (Argentina edition) RSS feed for each topic below,
# flatten the entries into one table and write it to ``googlenews_ar.tsv``
# sorted by the number of related articles (largest first).

# Each item pairs the value sent as the feed's ``topic`` query parameter
# ("letra") with the category label stored in the output ("categoria").
lista = [{"letra":"n","categoria":"Nacional"},
         {"letra":"s","categoria":"Deportes"},
         {"letra":"m","categoria":"Salud"},
         {"letra":"t","categoria":"Ciencia y Tecnología"},
         {"letra":"e","categoria":"Entretenimiento"},
         {"letra":"b","categoria":"Economía"}]


def _split_title(title):
    """Split an entry title of the form ``"headline - source"``.

    The split happens on the LAST ``" - "`` so that a headline which itself
    contains ``" - "`` is kept whole instead of being cut at its first dash.

    Returns:
        A ``(headline, source)`` tuple; ``source`` is ``''`` when the title
        contains no ``" - "`` separator.
    """
    titulo, separador, medio = title.rpartition(' - ')
    if not separador:
        return title, ''
    return titulo, medio


def _article_url(link):
    """Return the publisher's URL carried in the ``&url=`` part of *link*.

    The entry link starts with the news.google part and then carries the
    publisher's address after ``&url=``.  When that marker is absent the
    link is returned unchanged rather than raising ``IndexError``.
    """
    partes = link.split('&url=')
    return partes[1] if len(partes) > 1 else link


def _article_count(summary):
    """Return the related-article count advertised in an entry summary.

    The summary HTML is cut at every ``<b>``/``</b>`` tag; in a fragment that
    contains the "artículos informativos" marker, the second space-separated
    token is read as the count, with ``.`` thousands separators removed.

    Returns:
        The count as ``int``; ``1`` when the marker is missing or the token
        next to it is not a number.
    """
    cantidad = 1
    for fragmento in summary.replace("</b>", "<b>").split('<b>'):
        if u"artículos informativos&nbsp;&raquo;" in fragmento:
            try:
                cantidad = int(fragmento.split(' ')[1].replace(".", ""))
            except (IndexError, ValueError):
                # Unexpected wording: keep the current count instead of
                # aborting the whole download over a single entry.
                pass
    return cantidad


data = []
for tema in lista:
    feed_url = 'https://news.google.com/news?pz=1&cf=all&ned=es_ar&hl=es&topic=' + tema['letra'] + '&output=rss'
    feed = feedparser.parse(feed_url)
    # The first entry of every feed is skipped, as the original script did.
    # NOTE(review): presumably it is not a regular article - confirm.
    for entrada in feed.entries[1:]:
        titulo, medio = _split_title(entrada.title)
        data.append([
            titulo,
            medio,
            entrada.published,
            _article_url(entrada.link),
            tema['categoria'],
            _article_count(entrada.summary),
        ])

df = pd.DataFrame(data, columns=['Title', 'Source', 'Date', 'url', 'Category', 'Size'])
df = df.sort_values(by='Size', ascending=False)
# Tab-separated, with a header row and without the DataFrame index.
df.to_csv('googlenews_ar.tsv', sep='\t', header=True, index=False, encoding='utf-8')