block by nitaku 7d9dbc2e801ca1f1de91

Data matching diagram

Full Screen

This experiment proposes a diagram for displaying the results of a data matching (aka record linkage) problem. In this kind of problem, two different datasets A and B are automatically compared in order to find pairs of records that refer to the same real-world entity. We assume that the results come from a constrained matching problem, where the matching function is a one-to-one mapping (a bijection between the matched records of A and the matched records of B). The implication is that the number of matches found in A is equal to the number of matches found in B.

The number of elements in A and B is represented by the length of two juxtaposed, “misaligned” bars. A is depicted in shades of brown-orange, while B is depicted in shades of cyan. The number of matches is proportional to the length of the aligned portion of the bars (represented with more vivid colors). The remaining parts show the unmatched records in A (brown) and B (darker cyan).

The diagram could have been made more theoretically correct by left-aligning the two bars at the same x coordinate, thus keeping both portions representing unmatched records on the right. This would have enabled a better evaluation of the quantities encoded in the diagram, making it easier to compare the amounts of unmatched records found in the two datasets. However, we feel that our design presents a more intuitive depiction of a matching process, yielding a better metaphor than the theoretical approach: a sort of quantitative version of the classic two-set Venn diagram.

Terminology about data matching tasks is mainly from Cohen et al. 2002.

index.js

(function() {
  // Draws the data matching diagram into the page's <svg> element:
  // two horizontal bars (dataset A on top, dataset B below) whose
  // overlapping middle segment stands for the matched records.

  // Random integer in the half-open range [min, max).
  function randint(min, max) {
    return Math.floor(Math.random() * (max - min)) + min;
  }

  // --- data -------------------------------------------------------------
  var a = randint(1000, 3000);   // records found only in A
  var b = randint(2000, 4000);   // records found only in B
  var ab = randint(3000, 5000);  // matched records

  // --- setup ------------------------------------------------------------
  var svg = d3.select('svg');
  var bounds = svg.node().getBoundingClientRect();
  var width = bounds.width;
  var height = bounds.height;

  // Shift the viewBox so that (0,0) is the center of the drawing.
  svg.attr({
    viewBox: [-width / 2, -height / 2, width, height].join(' ')
  });

  // Maps a record count to a horizontal position, leaving a 40px margin.
  var x = d3.scale.linear()
    .domain([0, a + b + ab])
    .range([-width / 2 + 40, width / 2 - 40]);

  var BAR_HEIGHT = 40;      // height of each bar
  var BAR_GAP = 30;         // vertical gap between the two bars
  var BRACKET_OFFSET = 6;   // distance between a bar and its bracket
  var BRACKET_TICK = 10;    // length of the bracket's end ticks

  var topY = -BAR_HEIGHT - BAR_GAP / 2;
  var bottomY = +BAR_GAP / 2;
  var bracketY = BAR_HEIGHT + BAR_GAP / 2 + BRACKET_OFFSET;
  var datasetLabelY = bracketY + BRACKET_TICK;

  var baseHue = 45;
  var colorA = d3.hcl(baseHue, 50, 50);
  var colorAB = d3.hcl(baseHue + 30, 55, 70);
  var colorB = d3.hcl(baseHue + 180, 50, 50);
  var colorBA = d3.hcl(baseHue + 180 - 30, 55, 70);

  var thousands = d3.format(',');

  // --- drawing helpers --------------------------------------------------

  // Appends one bar segment spanning the counts [start, end] at height y.
  function bar(start, end, y, fill) {
    svg.append('rect').attr({
      'class': 'bar',
      x: x(start),
      y: y,
      width: x(end) - x(start),
      height: BAR_HEIGHT,
      fill: fill
    });
  }

  // Appends a label centered on the given count; the y attribute is only
  // set when a value is passed, so it otherwise stays at its default.
  function label(text, center, y) {
    var attrs = { 'class': 'label', x: x(center) };
    if (y !== undefined) {
      attrs.y = y;
    }
    attrs.dy = '0.35em';
    return svg.append('text').text(text).attr(attrs);
  }

  // Appends a bracket from start to end; the sign of `tick` decides
  // whether the bracket extends upwards (negative) or downwards.
  function bracket(start, end, y, tick) {
    svg.append('path').attr({
      'class': 'axis',
      d: 'M' + x(start) + ' ' + y +
         ' l0 ' + tick +
         ' l' + (x(end) - x(start)) + ' 0' +
         ' l0 ' + (-tick)
    });
  }

  // Inserts a background rect sized to the text's bounding box just
  // before it. Must run right after the text is appended, because the
  // insertion point is selected with '.label:last-child'.
  function halo(textSelection) {
    var box = textSelection.node().getBBox();
    svg.insert('rect', '.label:last-child').attr({
      'class': 'halo',
      x: box.x,
      y: box.y,
      width: box.width,
      height: box.height
    });
  }

  // --- bars -------------------------------------------------------------
  bar(0, a, topY, colorA);
  bar(a, a + ab, topY, colorAB);
  bar(a, a + ab, bottomY, colorBA);
  bar(a + ab, a + ab + b, bottomY, colorB);

  // --- segment labels ---------------------------------------------------
  label(thousands(a), a / 2);
  label(thousands(ab) + ' matches', a + ab / 2);
  label(thousands(b), a + ab + b / 2);

  // --- brackets ---------------------------------------------------------
  bracket(0, a + ab, -bracketY, -BRACKET_TICK);
  bracket(a, a + ab + b, +bracketY, +BRACKET_TICK);

  // --- dataset totals, each drawn over its own halo -----------------------
  halo(label(thousands(a + ab) + ' (dataset A)', (a + ab) / 2, -datasetLabelY));
  halo(label(thousands(b + ab) + ' (dataset B)', a + (b + ab) / 2, +datasetLabelY));

}).call(this);

index.html

<!DOCTYPE html>
<html lang="en">
  <head>
    <meta charset="utf-8">
    <meta name="description" content="Data matching diagram" />
    <title>Data matching diagram</title>
    <link type="text/css" href="index.css" rel="stylesheet"/>
    <!-- Explicit https: a protocol-relative URL would resolve against
         file:// when this page is opened from disk, and d3 would not load. -->
    <script src="https://d3js.org/d3.v3.min.js"></script>
  </head>
  <body>
    <!-- index.js reads the rendered size of this element to set its viewBox. -->
    <svg height="500" width="960"></svg>
    <!-- Loaded after the <svg> so that d3.select('svg') finds the element. -->
    <script src="index.js"></script>
  </body>
</html>

index.coffee

# Draws the data matching diagram into the page's <svg> element:
# two horizontal bars (dataset A on top, dataset B below) whose
# overlapping middle segment stands for the matched records.

# data
# random integer in the half-open range [min, max)
randint = (min, max) -> Math.floor(Math.random() * (max - min)) + min

a = randint(1000, 3000)   # records found only in A
b = randint(2000, 4000)   # records found only in B
ab = randint(3000, 5000)  # matched records

# setup
svg = d3.select('svg')
bounds = svg.node().getBoundingClientRect()
width = bounds.width
height = bounds.height

# translate the viewBox to have (0,0) at the center of the vis
svg.attr
  viewBox: [-width/2, -height/2, width, height].join(' ')

# maps a record count to a horizontal position, leaving a 40px margin
x = d3.scale.linear()
  .domain([0, a + b + ab])
  .range([-width/2 + 40, width/2 - 40])

barHeight = 40      # height of each bar
barGap = 30         # vertical gap between the two bars
bracketOffset = 6   # distance between a bar and its bracket
bracketTick = 10    # length of the bracket's end ticks

topY = -barHeight - barGap/2
bottomY = +barGap/2
bracketY = barHeight + barGap/2 + bracketOffset
datasetLabelY = bracketY + bracketTick

baseHue = 45
colorA = d3.hcl(baseHue, 50, 50)
colorAB = d3.hcl(baseHue + 30, 55, 70)
colorB = d3.hcl(baseHue + 180, 50, 50)
colorBA = d3.hcl(baseHue + 180 - 30, 55, 70)

thousands = d3.format(',')

# drawing helpers

# appends one bar segment spanning the counts [start, end] at height y
bar = (start, end, y, fill) ->
  svg.append('rect')
    .attr
      'class': 'bar'
      x: x(start)
      y: y
      width: x(end) - x(start)
      height: barHeight
      fill: fill

# appends a label centered on the given count; the y attribute is only
# set when a value is passed, so it otherwise stays at its default
label = (text, center, y) ->
  attrs =
    'class': 'label'
    x: x(center)
  attrs.y = y if y?
  attrs.dy = '0.35em'
  svg.append('text').text(text).attr(attrs)

# appends a bracket from start to end; the sign of `tick` decides
# whether the bracket extends upwards (negative) or downwards
bracket = (start, end, y, tick) ->
  svg.append('path')
    .attr
      'class': 'axis'
      d: "M#{x(start)} #{y} l0 #{tick} l#{x(end) - x(start)} 0 l0 #{-tick}"

# inserts a background rect sized to the text's bounding box just before
# it; must run right after the text is appended, because the insertion
# point is selected with '.label:last-child'
halo = (textSelection) ->
  box = textSelection.node().getBBox()
  svg.insert('rect', '.label:last-child')
    .attr
      'class': 'halo'
      x: box.x
      y: box.y
      width: box.width
      height: box.height

# vis
bar(0, a, topY, colorA)
bar(a, a + ab, topY, colorAB)
bar(a, a + ab, bottomY, colorBA)
bar(a + ab, a + ab + b, bottomY, colorB)

# labels
label(thousands(a), a/2)
label(thousands(ab) + ' matches', a + ab/2)
label(thousands(b), a + ab + b/2)

# brackets
bracket(0, a + ab, -bracketY, -bracketTick)
bracket(a, a + ab + b, +bracketY, +bracketTick)

# dataset totals, each drawn over its own halo
halo(label(thousands(a + ab) + ' (dataset A)', (a + ab)/2, -datasetLabelY))
halo(label(thousands(b + ab) + ' (dataset B)', a + (b + ab)/2, +datasetLabelY))

index.css

/* Drawing surface */
svg {
  background: white;
}

/* Bars, brackets and label halos share the same edge rendering hint */
.bar,
.axis,
.halo {
  shape-rendering: crispEdges;
}

/* Per-segment fills. The script in this file does not assign these
   classes; it sets each bar's color through the fill attribute. */
.a {
  fill: teal;
}
.ab {
  fill: blue;
}
.ba {
  fill: red;
}
.b {
  fill: orange;
}

/* Text labels, horizontally centered on their x position */
.label {
  fill: #333;
  font-family: sans-serif;
  font-size: 14px;
  text-anchor: middle;
}

/* Brackets spanning each dataset: stroked outline only */
.axis {
  fill: none;
  stroke: #CCC;
  stroke-width: 1;
}

/* White rect drawn behind a label; the thick white stroke extends it
   beyond the text's bounding box */
.halo {
  fill: white;
  stroke: white;
  stroke-width: 12px;
}