62148e12df0295e98998 by ramnathv

Kernel density estimation is a method of estimating the probability distribution of a random variable based on a random sample. In contrast to a histogram, kernel density estimation produces a smooth estimate. The smoothness can be tuned via the kernel’s bandwidth parameter. With the correct choice of bandwidth, important features of the distribution can be seen, while an incorrect choice results in undersmoothing or oversmoothing and obscured features.

This example shows a histogram and a kernel density estimation for times between eruptions of Old Faithful Geyser in Yellowstone National Park, taken from R’s faithful dataset. The data follow a bimodal distribution; short eruptions are followed by a wait time averaging about 55 minutes, and long eruptions by a wait time averaging about 80 minutes. In recent years, wait times have been increasing, possibly due to the effects of earthquakes on the geyser’s geohydrology.

This example is based on the version by Mike Bostock. It makes the parameters for changing histogram and kernel density plot parameters more accessible.

index.html

<!DOCTYPE html>
<meta charset="utf-8">
<title>Histogram and Kernel Density Plot</title>
<style>

<!-- based on //bl.ocks.org/mbostock/4341954 -->

body {
  font: 10px sans-serif;
}

.bar {
  fill: #bbb;
  shape-rendering: crispEdges;
}

.line {
  fill: none;
  stroke: #000;
  stroke-width: 1.5px;
}

.axis path,
.axis line {
  fill: none;
  stroke: #000;
  shape-rendering: crispEdges;
}

.y.axis path {
  display: none;
}

</style>
<body>
<script src="//d3js.org/d3.v3.min.js"></script>
<script>

// USER DEFINABLE VARIABLES START

var numHistBins = 10; // number of bins for the histogram
var calcHistBinsAutmoatic = true; // if true, the number of bins are calculated automatically and
// numHistBins is overwritten
var showKDP = true; // show the kernel density plot?
var bandwith = 4; // bandwith (smoothing constant) h of the kernel density estimator
var dataFN = "./faithful.json"; // the filename of the data to be visualized


// USER DEFINABLE VARIABLES END


var margin = {top: 20, right: 30, bottom: 30, left: 40},
    width = 960 - margin.left - margin.right,
    height = 500 - margin.top - margin.bottom;

// the x-scale parameters
var x = d3.scale.linear()
    .domain([30, 110])
    .range([0, width]);

// the y-scale parameters
var y = d3.scale.linear()
    .domain([0, .15])
    .range([height, 0]);

var xAxis = d3.svg.axis()
    .scale(x)
    .orient("bottom");

var yAxis = d3.svg.axis()
    .scale(y)
    .orient("left")
    .tickFormat(d3.format("%"));

var line = d3.svg.line()
    .x(function(d) { return x(d[0]); })
    .y(function(d) { return y(d[1]); });

// the histogram function
var histogram;

var svg = d3.select("body").append("svg")
    .attr("width", width + margin.left + margin.right)
    .attr("height", height + margin.top + margin.bottom)
  .append("g")
    .attr("transform", "translate(" + margin.left + "," + margin.top + ")");

// draw the background
svg.append("g")
    .attr("class", "x axis")
    .attr("transform", "translate(0," + height + ")")
    .call(xAxis)
  .append("text")
    .attr("class", "label")
    .attr("x", width)
    .attr("y", -6)
    .style("text-anchor", "end")
    .text("Time between Eruptions (min.)");

svg.append("g")
    .attr("class", "y axis")
    .call(yAxis);


// draw the histogram and kernel density plot	
d3.json(dataFN, function(error, faithful) {

    // calculate the number of histogram bins
	if( calcHistBinsAutmoatic == true) {
		 numHistBins = Math.ceil(Math.sqrt(faithful.length));  // global variable
	}
 // the histogram function
  histogram = d3.layout.histogram()
    .frequency(false)
    .bins(numHistBins);
	//.bins(x.ticks(500));
	
  var data = histogram(faithful);
  //var kde = kernelDensityEstimator(epanechnikovKernel(7), x.ticks(100));
  var kde = kernelDensityEstimator(epanechnikovKernel(bandwith), x.ticks(100));
  
  //alert("kde is " + kde.toSource());
  
  //console.log(svg.datum(kde(faithful)));

  svg.selectAll(".bar")
      .data(data)
    .enter().insert("rect", ".axis")
      .attr("class", "bar")
      .attr("x", function(d) { return x(d.x) + 1; })
      .attr("y", function(d) { return y(d.y); })
      .attr("width", x(data[0].dx + data[0].x) - x(data[0].x) - 1)
      .attr("height", function(d) { return height - y(d.y); });
	
	// show the kernel density plot
	if(showKDP == true) {
		svg.append("path")
		  .datum(kde(faithful))
		  .attr("class", "line")
		  .attr("d", line);
	  }

});

function kernelDensityEstimator(kernel, x) {
  return function(sample) {
    return x.map(function(x) {
		//console.log(x + " ... " + d3.mean(sample, function(v) { return kernel(x - v); }));		
		return [x, d3.mean(sample, function(v) { return kernel(x - v); })];
    });
  };
}

function epanechnikovKernel(bandwith) {
  return function(u) {
    //return Math.abs(u /= bandwith) <= 1 ? .75 * (1 - u * u) / bandwith : 0;
	if(Math.abs(u = u /  bandwith) <= 1) {
	 return 0.75 * (1 - u * u) / bandwith;
	} else return 0;
  };
}

	 

</script>

faithful.json

[79,54,74,62,85,55,88,85,51,85,54,84,78,47,83,52,62,84,52,79,51,47,78,69,74,83,55,76,78,79,73,77,66,80,74,52,48,80,59,90,80,58,84,58,73,83,64,53,82,59,75,90,54,80,54,83,71,64,77,81,59,84,48,82,60,92,78,78,65,73,82,56,79,71,62,76,60,78,76,83,75,82,70,65,73,88,76,80,48,86,60,90,50,78,63,72,84,75,51,82,62,88,49,83,81,47,84,52,86,81,75,59,89,79,59,81,50,85,59,87,53,69,77,56,88,81,45,82,55,90,45,83,56,89,46,82,51,86,53,79,81,60,82,77,76,59,80,49,96,53,77,77,65,81,71,70,81,93,53,89,45,86,58,78,66,76,63,88,52,93,49,57,77,68,81,81,73,50,85,74,55,77,83,83,51,78,84,46,83,55,81,57,76,84,77,81,87,77,51,78,60,82,91,53,78,46,77,84,49,83,71,80,49,75,64,76,53,94,55,76,50,82,54,75,78,79,78,78,70,79,70,54,86,50,90,54,54,77,79,64,75,47,86,63,85,82,57,82,67,74,54,83,73,73,88,80,71,83,56,79,78,84,58,83,43,60,75,81,46,90,46,74]