block by jfirebaugh 900762

900762

Full Screen

index.html

<html>
  <head>
    <title>Kernel Density Estimation</title>
    <script type="text/javascript" src="//vis.stanford.edu/protovis/protovis-r3.2.js"></script>
    <script type="text/javascript" src="kde.js"></script>
    <script type="text/javascript" src="faithful.js"></script>
	<style type="text/css">

	body {
	  margin: 0;
	}

	</style>
  </head>
  <body>
    <script type="text/javascript+protovis">

/* Sizing and scales. */
var w          = document.body.clientWidth - 40,
    h          = document.body.clientHeight - 25,
    x          = pv.Scale.linear(30, 110).range(0, w),
    y          = pv.Scale.linear(0, 0.1).range(0, h),
    bandwidths = [1, 7, 20];

/* The root panel. */
var vis = new pv.Panel()
    .width(w)
    .height(h)
    .bottom(20)
    .left(30)
    .right(10)
    .top(5);

/* X-axis ticks. */
vis.add(pv.Rule)
    .data(x.ticks())
    .left(x)
    .strokeStyle("#eee")
  .add(pv.Rule)
    .bottom(-5)
    .height(5)
    .strokeStyle("#000")
  .anchor("bottom").add(pv.Label)
    .text(x.tickFormat);

/* Y-axis ticks. */
vis.add(pv.Rule)
    .data(y.ticks(5))
    .bottom(y)
    .strokeStyle("#eee")
  .anchor("left").add(pv.Label)
    .text(y.tickFormat);

/* Histogram */
vis.add(pv.Bar)
    .data(pv.histogram(faithful).frequency(false).bins(x.ticks(60)))
    .bottom(0)
    .left(function(d) x(d.x))
    .width(function(d) x(d.dx + 30))
    .height(function(d) y(d.y))
    .fillStyle("#aaa")
    .strokeStyle("rgba(255, 255, 255, .2)")
    .lineWidth(1);

/* Kernel density estimates */
var panel = vis.add(pv.Panel)
    .data(bandwidths);

var line = panel.add(pv.Line)
    .data(function(h) kde(faithful).scale(h).points(pv.range(30, 110, 0.1)))
    .left(function(d) x(d.x))
    .bottom(function(d) y(d.y));

panel.add(pv.Dot)
     .shape("square")
     .lineWidth(0)
     .fillStyle(function() line.strokeStyle())
     .top(function () this.parent.index * 12 + 10)
     .left(10)
   .anchor("right").add(pv.Label)
     .text(function (h) "h = " + h);

vis.render();

    </script>
  </body>
</html>

faithful.js

var faithful = [
  79, 54, 74, 62, 85, 55, 88, 85, 51, 85, 54, 84, 78, 47, 83, 52, 62, 84, 52, 79, 51, 47, 78, 69, 74, 83, 55, 76, 78, 79, 73, 77, 66, 80, 74, 52, 48, 80, 59, 90, 80, 58, 84, 58, 73, 83, 64, 53,
  82, 59, 75, 90, 54, 80, 54, 83, 71, 64, 77, 81, 59, 84, 48, 82, 60, 92, 78, 78, 65, 73, 82, 56, 79, 71, 62, 76, 60, 78, 76, 83, 75, 82, 70, 65, 73, 88, 76, 80, 48, 86, 60, 90, 50, 78, 63, 72,
  84, 75, 51, 82, 62, 88, 49, 83, 81, 47, 84, 52, 86, 81, 75, 59, 89, 79, 59, 81, 50, 85, 59, 87, 53, 69, 77, 56, 88, 81, 45, 82, 55, 90, 45, 83, 56, 89, 46, 82, 51, 86, 53, 79, 81, 60, 82, 77,
  76, 59, 80, 49, 96, 53, 77, 77, 65, 81, 71, 70, 81, 93, 53, 89, 45, 86, 58, 78, 66, 76, 63, 88, 52, 93, 49, 57, 77, 68, 81, 81, 73, 50, 85, 74, 55, 77, 83, 83, 51, 78, 84, 46, 83, 55, 81, 57,
  76, 84, 77, 81, 87, 77, 51, 78, 60, 82, 91, 53, 78, 46, 77, 84, 49, 83, 71, 80, 49, 75, 64, 76, 53, 94, 55, 76, 50, 82, 54, 75, 78, 79, 78, 78, 70, 79, 70, 54, 86, 50, 90, 54, 54, 77, 79, 64,
  75, 47, 86, 63, 85, 82, 57, 82, 67, 74, 54, 83, 73, 73, 88, 80, 71, 83, 56, 79, 78, 84, 58, 83, 43, 60, 75, 81, 46, 90, 46, 74
];

kde.js

function kde(sample) {
  /* Epanechnikov kernel */
  function epanechnikov(u) {
    return Math.abs(u) <= 1 ? 0.75 * (1 - u*u) : 0;
  };

  var kernel = epanechnikov;
  return {
    scale: function(h) {
      kernel = function (u) { return epanechnikov(u / h) / h; };
      return this;
    },

    points: function(points) {
      return points.map(function(x) {
        var y = pv.sum(sample.map(function (v) {
          return kernel(x - v);
        })) / sample.length;
        return {x: x, y: y};
      });
    }
  }
}

readme.mkd

A [kernel density estimate](http://en.wikipedia.org/wiki/Kernel_density_estimation) provides a means of estimating and visualizing the probability distribution function of a random variable based on a random sample. In contrast to a histogram, a kernel density estimate provides a smooth estimate, via the effect of a smoothing parameter called the _bandwidth_, here denoted by _h_. With the correct choice of bandwidth, important features of the distribution can be seen; an incorrect choice will result in undersmoothing or oversmoothing and obscure those features.

Here we see a histogram and three kernel density estimates for a sample of waiting times in minutes between eruptions of [Old Faithful Geyser](http://en.wikipedia.org/wiki/Old_Faithful) in Yellowstone National Park, taken from R's [`faithful`](http://stat.ethz.ch/R-manual/R-patched/library/datasets/html/faithful.html) dataset. The data follow a [bimodal distribution](http://en.wikipedia.org/wiki/Bimodal_distribution); short eruptions are followed by a wait time averaging about 55 minutes, and long eruptions by a wait time averaging about 80 minutes. In recent years, wait times have been increasing, possibly due to the effects of earthquakes on the geyser's geohydrology.