openvis tweets #4.5: data crunch by sxywu

index.html

<!DOCTYPE html>
<head>
  <meta charset="utf-8">
  <script src="https://cdnjs.cloudflare.com/ajax/libs/d3/3.5.5/d3.min.js"></script>
  <script src='https://cdnjs.cloudflare.com/ajax/libs/lodash.js/4.11.2/lodash.js'></script>
  <link href='https://fonts.googleapis.com/css?family=Lora' rel='stylesheet' type='text/css'>
  
  <style>
    /* Page-wide defaults: no outer margin, purple Lora text. */
    body {
      margin: 0;
      color: #49438C;
      font-family: 'Lora', serif;
    }

    /* Fixed-width wrapper holding the two columns. */
    #main {
      width: 1000px;
    }

    /* The two columns sit side by side, aligned at the top. */
    #left,
    #right {
      display: inline-block;
      vertical-align: top;
      width: 45%;
      padding: 15px;
    }

    /* Every entry inside a column flows inline so the lists wrap like text. */
    #left div,
    #right div {
      display: inline-block;
      padding: 5px;
    }
    
  </style>
</head>

<body>
  <div id='main'>
    <div id='left'></div>
    <div id='right'></div>
  </div>

  <script>
  // Time window of interest; both bounds are written with a -04:00 UTC offset.
  var startDate = new Date('2016-04-25T00:00:00-04:00');
  var endDate = new Date('2016-04-27T00:00:00-04:00');
  // d3 time formatter: locale date followed by a 12-hour clock time with AM/PM.
  var dateFormat = d3.time.format('%x %I:%M%p');

  // 100 most common words, taken from https://gist.github.com/gravitymonkey/2406023
  // thank you gravitymonkey you beautiful person.
  var commonWords = ["the","of","and","a","to","in","is","you","that","it","he","was","for","on","are","as","with","his","they","I","at","be","this","have","from","or","one","had","by","word","but","not","what","all","were","we","when","your","can","said","there","use","an","each","which","she","do","how","their","if","will","up","other","about","out","many","then","them","these","so","some","her","would","make","like","him","into","time","has","look","two","more","write","go","see","number","no","way","could","people","my","than","first","water","been","call","who","oil","its","now","find","long","down","day","did","get","come","made","may","part"];
  // Extra words to ignore that are specific to this data set.
  var customWords = ["openvisconf", "talk", "me", "here", "im", "very", "just", "too", "really", "much", "our", "us", "most", "another", "off", "should", "cant", "via", "going", "dont", "also", "says", "always", "after", "such", "check", "need", "keep", "say", "any", "hey", "between", "–", "over", "강남풀싸롱", "available", "gt", "got", "still", "lots", "being", "seen", "looks", "free", "am", "users", "take", "tiny", "own", "before", "big", "england", "back", "ive", "everyone", "super", "maybe", "stuff", "even", "lot", "make", "last", "open", "through", "something", "httpstcozc0ps1kc8h"];

  // Merge both lists and turn them into a lookup table keyed by the
  // lower-cased word (value 1), so membership is a single property access.
  commonWords = _.union(commonWords, customWords).reduce(function(lookup, entry) {
    lookup[entry.toLowerCase()] = 1;
    return lookup;
  }, {});
  // Maps a variant or inflected spelling (key) to the single form (value)
  // under which it is counted.
  var translations = {
    datavis: 'dataviz',
    viz: 'vis',
    charts: 'chart',
    tools: 'tool',
    things: 'thing',
    visualizations: 'visualization',
    using: 'use',
    making: 'make',
    slides: 'slide',
    talks: 'talk',
    learning: 'learn',
    visualizing: 'visualize',
    showing: 'show',
    looking: 'look',
    talking: 'talk',
    thanks: 'thank',
    tweets: 'tweet',
    working: 'work',
    maps: 'map',
    thinking: 'think',
    speakers: 'speaker',
    friends: 'friend',
    d3js: 'd3',
    days: 'day',
    folks: 'folk'
  };
    
  // Crunch tweets.json into word statistics:
  //   1. keep tweets that do not start with "RT" and fall inside
  //      [startDate, endDate], indexed by their link;
  //   2. count, per word, the number of distinct tweets it appears in;
  //   3. keep the 100 most frequent words;
  //   4. find pairs of those words that share more than one tweet;
  //   5. render both lists and log the word data as JSON.
  // The callback declares (error, tweets) so that a failed load is reported
  // instead of silently producing an empty page.
  d3.json('tweets.json', function(error, tweets) {
    if (error || !tweets) {
      console.error('could not load tweets.json', error);
      return;
    }

    // Punctuation removed from every token. '@' is deliberately not in the
    // set so that mentions can still be recognised afterwards.
    const punctuation = /[.,\/#!$%\^&\*;:{}=\-_`~()'|+]/g;

    // Own-property lookup. The dictionaries are plain objects, so a bare
    // dict[key] would also find inherited members such as "constructor".
    const lookup = (dict, key) =>
      Object.prototype.hasOwnProperty.call(dict, key) ? dict[key] : undefined;

    // Returns the form under which a raw token is counted, or '' when the
    // token must be ignored (empty, mention, stop word, or numeric).
    const normalize = token => {
      const word = token.toLowerCase().replace(punctuation, '');
      if (!word || word.startsWith('@') || lookup(commonWords, word)) return '';
      // Explicit NaN test with a radix: a truthiness test on parseInt() would
      // let tokens whose leading number is zero ("0", "0th") slip through.
      if (!Number.isNaN(Number.parseInt(word, 10))) return '';
      // NOTE(review): the translated form is not re-checked against the stop
      // list, so e.g. "talks" is counted as "talk" although "talk" itself is
      // ignored. Left as is; confirm whether that is intended.
      return lookup(translations, word) || word;
    };

    // Qualifying tweets keyed by link, inserted newest first.
    const tweetsByLink = _.chain(tweets)
      .filter(tweet => {
        tweet.date = new Date(tweet.postedTime);
        return !tweet.body.startsWith('RT') &&
          startDate <= tweet.date && tweet.date <= endDate;
      })
      .sortBy(tweet => -tweet.date)
      .reduce((byLink, tweet) => {
        byLink[tweet.link] = tweet;
        return byLink;
      }, {})
      .value();

    // word -> {text, count, tweets: {link: 1}, users: {username: 1}}
    const wordStats = {};
    _.each(tweetsByLink, (tweet, link) => {
      const username = tweet.actor.preferredUsername;
      // Split on any whitespace so that words separated by a newline or a
      // tab are not fused into one token.
      tweet.body.split(/\s+/).forEach(token => {
        const word = normalize(token);
        if (!word) return;

        let stats = lookup(wordStats, word);
        if (!stats) {
          stats = wordStats[word] = {
            text: word,
            count: 0,
            tweets: {},
            users: {}
          };
        }

        // A word is counted at most once per tweet.
        if (!stats.tweets[link]) {
          stats.count += 1;
          stats.tweets[link] = 1;
          stats.users[username] = 1;
        }
      });
    });

    // The 100 words that appear in the most tweets, most frequent first.
    const words = _.chain(wordStats)
      .sortBy(stats => -stats.count)
      .take(100)
      .value();

    // For every tweet containing at least one top word: the set of top words
    // it contains (wordsByTweets) and the tweet itself (filteredTweets).
    const filteredTweets = {};
    const wordsByTweets = {};
    _.each(words, word => {
      _.each(word.tweets, (flag, link) => {
        const present = wordsByTweets[link] || (wordsByTweets[link] = {});
        present[word.text] = 1;
        filteredTweets[link] = tweetsByLink[link];
      });
    });

    // Count, for every unordered pair of top words, the tweets containing
    // both. Each pair is visited once per tweet (j > i), so no guard against
    // double counting is needed. source/target keep the order in which the
    // two words are enumerated for the first tweet that contains the pair.
    const pairs = {};
    _.each(wordsByTweets, (present, link) => {
      const names = _.keys(present);
      for (let i = 0; i < names.length; i += 1) {
        for (let j = i + 1; j < names.length; j += 1) {
          // Sorted key, so (a, b) and (b, a) share one entry.
          const key = [names[i], names[j]].sort().join(',');
          if (!pairs[key]) {
            pairs[key] = {
              count: 0,
              source: names[i],
              target: names[j],
              tweets: {}
            };
          }
          pairs[key].count += 1;
          pairs[key].tweets[link] = 1;
        }
      }
    });

    // Keep pairs that co-occur in more than one tweet, most frequent first.
    const correlations = _.chain(pairs)
      .filter(pair => pair.count > 1)
      .sortBy(pair => -pair.count)
      .value();

    // Render: top words on the left, correlated pairs on the right.
    const left = d3.select('#left');
    const right = d3.select('#right');
    left.append('h1')
      .text('top 100 words');
    right.append('h1')
      .text('commonly correlated words');
    left.selectAll('div')
      .data(words)
      .enter().append('div')
        .text(d => d.text);
    right.selectAll('div')
      .data(correlations)
      .enter().append('div')
        .text(d => d.source + ',' + d.target);

    // Replace the {key: 1} sets with plain key arrays before serialising.
    _.each(words, word => {
      word.tweets = _.keys(word.tweets);
      word.users = _.keys(word.users);
    });
    _.each(correlations, correlation => {
      correlation.tweets = _.keys(correlation.tweets);
    });
    _.each(wordsByTweets, (present, link) => {
      wordsByTweets[link] = _.keys(present);
    });

    // One export at a time: uncomment the data set to dump to the console.
//     console.log(JSON.stringify(filteredTweets));
    console.log(JSON.stringify(words));
//     console.log(JSON.stringify(correlations));
//     console.log(JSON.stringify(wordsByTweets));
//     console.log(_.size(filteredTweets));

  });
  </script>
</body>