800 city names, grouped according to their first 4 letters:
Levenshtein-based distance
Original work by Philippe Rivière for Visionscarto.net. Comments and variants very welcome!
Forked from tsne world
forked from Fil‘s block: t-SNE with Levenshtein distances
<!DOCTYPE html>
<head>
<meta charset="utf-8">
<script src="https://d3js.org/d3.v4.min.js"></script>
<style>
body {
margin: 0;
position: fixed;
top: 0;
right: 0;
bottom: 0;
left: 0;
font-family: monospace;
}
</style>
</head>
<body>
<script>
const width = 960,
height = 500,
margin = { top: 40, bottom: 40, left: 140, right: 140 },
scalepop = d3.scaleSqrt().domain([0, 100000]).range([0.2, 28]),
scalecountry = d3.scaleOrdinal(
d3.merge([d3.schemeCategory20b, d3.schemeCategory20c])),
centerx = d3.scaleLinear()
.range([margin.left, width - margin.right]),
centery = d3.scaleLinear()
.range([margin.top, height - margin.bottom]);
d3.csv('cities.csv', function (cities) {
const data = cities
.sort((a, b) => d3.descending(+a[2015], +b[2015]))
.map((d, i) => {
return {
lon: +d.Longitude,
lat: +d.Latitude,
name: d['Urban Agglomeration'],
r: scalepop(+d[2015]),
//color: scalecountry(+d['Country Code'])
};
})
.slice(0, 800);
/*
const canvas = d3.select("body").append("canvas")
.attr("width", width)
.attr("height", height);
*/
const svg = d3.select("body").append("svg")
.attr("width", width)
.attr("height", height);
// pos is the array of positions that will be updated by the tsne worker
// start with the geographic coordinates as is (plate-carrée)
// random or [0,0] is fine too
let pos = data.map(d => [d.lon, -d.lat]);
const forcetsne = d3.forceSimulation(
data.map(d => (d.x = width / 2, d.y = height / 2, d))
)
.alphaDecay(0.005)
.alpha(0.1)
.force('tsne', function (alpha) {
centerx.domain(d3.extent(pos.map(d => d[0])));
centery.domain(d3.extent(pos.map(d => d[1])));
data.forEach((d, i) => {
d.vx += alpha * (centerx(pos[i][0]) - d.x);
d.vy += alpha * (centery(pos[i][1]) - d.y);
d.color = scalecountry(pos[i][2]);
});
})
.force('collide', d3.forceCollide().radius(d => 1.5 + d.r))
.on('tick', function () {
// drawcanvas(canvas, data);
drawsvg(svg, data);
});
function drawcanvas(canvas, nodes) {
let context = canvas.node().getContext("2d");
context.clearRect(0, 0, width, width);
for (var i = 0, n = nodes.length; i < n; ++i) {
var node = nodes[i];
context.beginPath();
context.moveTo(node.x, node.y);
context.arc(node.x, node.y, node.r, 0, 2 * Math.PI);
context.lineWidth = 0.5;
context.fillStyle = node.color;
context.fill();
}
}
function drawsvg(svg, nodes) {
const g = svg.selectAll('g.city')
.data(nodes);
var enter = g.enter().append('g').classed('city', true);
enter.append('circle')
.attr('r', d => d.r)
//.attr('fill', d => d.color)
.append('title')
.text(d => d.name);
enter
.filter(d => d.r > 6)
.append('text')
.attr('fill', 'white')
.style('font-size', d => d.r > 9 ? '12px' : '9px')
.attr('text-anchor', 'middle')
.attr('dominant-baseline', 'middle')
.attr('pointer-events', 'none')
.text(d => d.name.substring(0,2));
g
.attr('fill', d => d.color)
.attr('transform', d => `translate(${d.x},${d.y})`);
}
d3.queue()
.defer(d3.text, 'tsne.js')
.defer(d3.text, 'worker.js')
.awaitAll(function (err, scripts) {
const worker = new Worker(
window.URL.createObjectURL(
new Blob(scripts, {
type: "text/javascript"
})
)
);
worker.postMessage({
maxIter: 10,
dim: 2,
perplexity: 30.0,
data: data
});
worker.onmessage = function (e) {
if (e.data.log) console.log.apply(this, e.data.log);
if (e.data.pos) pos = e.data.pos;
if (e.data.done && e.data.done < 10000 && e.data.cost > 1e-2) {
worker.postMessage({
maxIter: e.data.done + 10,
});
}
};
});
});
</script>
</body>
// create main global object
var tsnejs = tsnejs || { REVISION: 'ALPHA' };
(function(global) {
"use strict";
// utility function
var assert = function(condition, message) {
if (!condition) { throw message || "Assertion failed"; }
}
// syntax sugar
var getopt = function(opt, field, defaultval) {
if(opt.hasOwnProperty(field)) {
return opt[field];
} else {
return defaultval;
}
}
// return 0 mean unit standard deviation random number
var return_v = false;
var v_val = 0.0;
var gaussRandom = function() {
if(return_v) {
return_v = false;
return v_val;
}
var u = 2*Math.random()-1;
var v = 2*Math.random()-1;
var r = u*u + v*v;
if(r == 0 || r > 1) return gaussRandom();
var c = Math.sqrt(-2*Math.log(r)/r);
v_val = v*c; // cache this for next function call for efficiency
return_v = true;
return u*c;
}
// return random normal number
var randn = function(mu, std){ return mu+gaussRandom()*std; }
// utilitity that creates contiguous vector of zeros of size n
var zeros = function(n) {
if(typeof(n)==='undefined' || isNaN(n)) { return []; }
if(typeof ArrayBuffer === 'undefined') {
// lacking browser support
var arr = new Array(n);
for(var i=0;i<n;i++) { arr[i]= 0; }
return arr;
} else {
return new Float64Array(n); // typed arrays are faster
}
}
// utility that returns 2d array filled with random numbers
// or with value s, if provided
var randn2d = function(n,d,s) {
var uses = typeof s !== 'undefined';
var x = [];
for(var i=0;i<n;i++) {
var xhere = [];
for(var j=0;j<d;j++) {
if(uses) {
xhere.push(s);
} else {
xhere.push(randn(0.0, 1e-4));
}
}
x.push(xhere);
}
return x;
}
// compute L2 distance between two vectors
var L2 = function(x1, x2) {
var D = x1.length;
var d = 0;
for(var i=0;i<D;i++) {
var x1i = x1[i];
var x2i = x2[i];
d += (x1i-x2i)*(x1i-x2i);
}
return d;
}
// compute pairwise distance in all vectors in X
var xtod = function(X) {
var N = X.length;
var dist = zeros(N * N); // allocate contiguous array
for(var i=0;i<N;i++) {
for(var j=i+1;j<N;j++) {
var d = L2(X[i], X[j]);
dist[i*N+j] = d;
dist[j*N+i] = d;
}
}
return dist;
}
// compute (p_{i|j} + p_{j|i})/(2n)
var d2p = function(D, perplexity, tol) {
var Nf = Math.sqrt(D.length); // this better be an integer
var N = Math.floor(Nf);
assert(N === Nf, "D should have square number of elements.");
var Htarget = Math.log(perplexity); // target entropy of distribution
var P = zeros(N * N); // temporary probability matrix
var prow = zeros(N); // a temporary storage compartment
for(var i=0;i<N;i++) {
var betamin = -Infinity;
var betamax = Infinity;
var beta = 1; // initial value of precision
var done = false;
var maxtries = 50;
// perform binary search to find a suitable precision beta
// so that the entropy of the distribution is appropriate
var num = 0;
while(!done) {
//debugger;
// compute entropy and kernel row with beta precision
var psum = 0.0;
for(var j=0;j<N;j++) {
var pj = Math.exp(- D[i*N+j] * beta);
if(i===j) { pj = 0; } // we dont care about diagonals
prow[j] = pj;
psum += pj;
}
// normalize p and compute entropy
var Hhere = 0.0;
for(var j=0;j<N;j++) {
if(psum == 0) {
var pj = 0;
} else {
var pj = prow[j] / psum;
}
prow[j] = pj;
if(pj > 1e-7) Hhere -= pj * Math.log(pj);
}
// adjust beta based on result
if(Hhere > Htarget) {
// entropy was too high (distribution too diffuse)
// so we need to increase the precision for more peaky distribution
betamin = beta; // move up the bounds
if(betamax === Infinity) { beta = beta * 2; }
else { beta = (beta + betamax) / 2; }
} else {
// converse case. make distrubtion less peaky
betamax = beta;
if(betamin === -Infinity) { beta = beta / 2; }
else { beta = (beta + betamin) / 2; }
}
// stopping conditions: too many tries or got a good precision
num++;
if(Math.abs(Hhere - Htarget) < tol) { done = true; }
if(num >= maxtries) { done = true; }
}
// console.log('data point ' + i + ' gets precision ' + beta + ' after ' + num + ' binary search steps.');
// copy over the final prow to P at row i
for(var j=0;j<N;j++) { P[i*N+j] = prow[j]; }
} // end loop over examples i
// symmetrize P and normalize it to sum to 1 over all ij
var Pout = zeros(N * N);
var N2 = N*2;
for(var i=0;i<N;i++) {
for(var j=0;j<N;j++) {
Pout[i*N+j] = Math.max((P[i*N+j] + P[j*N+i])/N2, 1e-100);
}
}
return Pout;
}
// helper function
function sign(x) { return x > 0 ? 1 : x < 0 ? -1 : 0; }
var tSNE = function(opt) {
var opt = opt || {};
this.perplexity = getopt(opt, "perplexity", 30); // effective number of nearest neighbors
this.dim = getopt(opt, "dim", 2); // by default 2-D tSNE
this.epsilon = getopt(opt, "epsilon", 10); // learning rate
this.iter = 0;
}
tSNE.prototype = {
// this function takes a set of high-dimensional points
// and creates matrix P from them using gaussian kernel
initDataRaw: function(X) {
var N = X.length;
var D = X[0].length;
assert(N > 0, " X is empty? You must have some data!");
assert(D > 0, " X[0] is empty? Where is the data?");
var dists = xtod(X); // convert X to distances using gaussian kernel
this.P = d2p(dists, this.perplexity, 1e-4); // attach to object
this.N = N; // back up the size of the dataset
this.initSolution(); // refresh this
},
// this function takes a given distance matrix and creates
// matrix P from them.
// D is assumed to be provided as a list of lists, and should be symmetric
initDataDist: function(D) {
var N = D.length;
assert(N > 0, " X is empty? You must have some data!");
// convert D to a (fast) typed array version
var dists = zeros(N * N); // allocate contiguous array
for(var i=0;i<N;i++) {
for(var j=i+1;j<N;j++) {
var d = D[i][j];
dists[i*N+j] = d;
dists[j*N+i] = d;
}
}
this.P = d2p(dists, this.perplexity, 1e-4);
this.N = N;
this.initSolution(); // refresh this
},
// (re)initializes the solution to random
initSolution: function() {
// generate random solution to t-SNE
this.Y = randn2d(this.N, this.dim); // the solution
this.gains = randn2d(this.N, this.dim, 1.0); // step gains to accelerate progress in unchanging directions
this.ystep = randn2d(this.N, this.dim, 0.0); // momentum accumulator
this.iter = 0;
},
// return pointer to current solution
getSolution: function() {
return this.Y;
},
// perform a single step of optimization to improve the embedding
step: function() {
this.iter += 1;
var N = this.N;
var cg = this.costGrad(this.Y); // evaluate gradient
var cost = cg.cost;
var grad = cg.grad;
// perform gradient step
var ymean = zeros(this.dim);
for(var i=0;i<N;i++) {
for(var d=0;d<this.dim;d++) {
var gid = grad[i][d];
var sid = this.ystep[i][d];
var gainid = this.gains[i][d];
// compute gain update
var newgain = sign(gid) === sign(sid) ? gainid * 0.8 : gainid + 0.2;
if(newgain < 0.01) newgain = 0.01; // clamp
this.gains[i][d] = newgain; // store for next turn
// compute momentum step direction
var momval = this.iter < 250 ? 0.5 : 0.8;
var newsid = momval * sid - this.epsilon * newgain * grad[i][d];
this.ystep[i][d] = newsid; // remember the step we took
// step!
this.Y[i][d] += newsid;
ymean[d] += this.Y[i][d]; // accumulate mean so that we can center later
}
}
// reproject Y to be zero mean
for(var i=0;i<N;i++) {
for(var d=0;d<this.dim;d++) {
this.Y[i][d] -= ymean[d]/N;
}
}
//if(this.iter%100===0) console.log('iter ' + this.iter + ', cost: ' + cost);
return cost; // return current cost
},
// for debugging: gradient check
debugGrad: function() {
var N = this.N;
var cg = this.costGrad(this.Y); // evaluate gradient
var cost = cg.cost;
var grad = cg.grad;
var e = 1e-5;
for(var i=0;i<N;i++) {
for(var d=0;d<this.dim;d++) {
var yold = this.Y[i][d];
this.Y[i][d] = yold + e;
var cg0 = this.costGrad(this.Y);
this.Y[i][d] = yold - e;
var cg1 = this.costGrad(this.Y);
var analytic = grad[i][d];
var numerical = (cg0.cost - cg1.cost) / ( 2 * e );
console.log(i + ',' + d + ': gradcheck analytic: ' + analytic + ' vs. numerical: ' + numerical);
this.Y[i][d] = yold;
}
}
},
// return cost and gradient, given an arrangement
costGrad: function(Y) {
var N = this.N;
var dim = this.dim; // dim of output space
var P = this.P;
var pmul = this.iter < 100 ? 4 : 1; // trick that helps with local optima
// compute current Q distribution, unnormalized first
var Qu = zeros(N * N);
var qsum = 0.0;
for(var i=0;i<N;i++) {
for(var j=i+1;j<N;j++) {
var dsum = 0.0;
for(var d=0;d<dim;d++) {
var dhere = Y[i][d] - Y[j][d];
dsum += dhere * dhere;
}
var qu = 1.0 / (1.0 + dsum); // Student t-distribution
Qu[i*N+j] = qu;
Qu[j*N+i] = qu;
qsum += 2 * qu;
}
}
// normalize Q distribution to sum to 1
var NN = N*N;
var Q = zeros(NN);
for(var q=0;q<NN;q++) { Q[q] = Math.max(Qu[q] / qsum, 1e-100); }
var cost = 0.0;
var grad = [];
for(var i=0;i<N;i++) {
var gsum = new Array(dim); // init grad for point i
for(var d=0;d<dim;d++) { gsum[d] = 0.0; }
for(var j=0;j<N;j++) {
cost += - P[i*N+j] * Math.log(Q[i*N+j]); // accumulate cost (the non-constant portion at least...)
var premult = 4 * (pmul * P[i*N+j] - Q[i*N+j]) * Qu[i*N+j];
for(var d=0;d<dim;d++) {
gsum[d] += premult * (Y[i][d] - Y[j][d]);
}
}
grad.push(gsum);
}
return {cost: cost, grad: grad};
}
}
global.tSNE = tSNE; // export tSNE class
})(tsnejs);
// export the library to window, or to module in nodejs
(function(lib) {
"use strict";
if (typeof module === "undefined" || typeof module.exports === "undefined") {
if (typeof window == "object")
window.tsnejs = lib; // in ordinary browser attach library to window
} else {
module.exports = lib; // in nodejs
}
})(tsnejs);
importScripts('https://d3js.org/d3.v4.min.js');
// in worker.onmessage add : `if (e.data.log) console.log.apply(this, e.data.log);`
console.log = function() {
self.postMessage({ log: [...arguments] });
}
function levenshtein(a, b) {
var t = [], u, i, j, m = a.length, n = b.length;
if (!m) { return n; }
if (!n) { return m; }
for (j = 0; j <= n; j++) { t[j] = j; }
for (i = 1; i <= m; i++) {
for (u = [i], j = 1; j <= n; j++) {
u[j] = a[i - 1] === b[j - 1] ? t[j - 1] : Math.min(t[j - 1], t[j], u[j - 1]) + 1;
} t = u;
} return u[n];
}
let iterations = 0,
model;
let K=18,
centers = d3.range(K).map(i => [ 0, 0 ]);
self.onmessage = function(e) {
const msg = e.data,
data = msg.data,
maxIter = msg.maxIter || 500,
perplexity = msg.perplexity || 30;
let dists = msg.dist;
if (data && !dists){
let now = performance.now();
dists = data.map(
a => data.map(
b => d3.range(4).map(n => levenshtein(a.name.substring(0,n).toLowerCase(), b.name.substring(0,n).toLowerCase()) * Math.pow(1/2, n))
.reduce((a,b)=>a+b, 0)
)
);
console.log('computed', data.length * data.length,'distances in', Math.round(performance.now()-now),'ms');
}
if (dists) {
model = new tsnejs.tSNE({
dim: msg.dim,
perplexity: perplexity,
});
model.initDataDist(dists);
}
let startpos = model.getSolution().map(d=>d.slice()),
pos;
while (iterations++ < maxIter) {
// every time you call this, solution gets better
model.step();
pos = model.getSolution();
// each K-center moves to the barycenter of its group
var m = pos[Math.floor(Math.random() * pos.length)];
centers.forEach((c,i) => {
let p = pos.filter(d => d[2] == i);
if (p.length < 10) c = m;
else {
c[0] = d3.mean(p.map(d => d[0]));
c[1] = d3.mean(p.map(d => d[1]));
}
});
// group each point according to its closest K-center
pos.forEach(d => {
d[2] = d3.scan(centers.map(c => {
var dx = d[0] - c[0],
dy = d[1] - c[1];
return (dx*dx + dy*dy);
}));
});
// Y is an array of 2-D points that you can plot
self.postMessage({
iterations: iterations - 1,
pos: pos,
//log: [ 'step', iterations-1 ]
});
}
let cost = startpos
.map((d,i) => sqdist(d, pos[i]))
.reduce ((a,b) => a + b, 0) / pos.length / Math.max(...pos.map(d => Math.abs(d[0]) + Math.abs(d[1])));
self.postMessage({
done: iterations - 1,
cost: cost,
log: [ 'done', iterations - 1 ]
});
};
function sqdist (a,b) {
let d = [a[0] - b[0], a[1] - b[1]].map(Math.abs);
return Math.max(d[0], d[1]);
}