First pass at a simple point clustering algo for plpgsql by andrewxhill

point_cluster.sql

CREATE TYPE axh_cdb_row_clusters (
  cartodb_id   int,
  cluster_id   int,
  size    int
);

CREATE OR REPLACE FUNCTION axh_cdb_row_cluster(_t text, zoom INT)
  RETURNS SETOF axh_cdb_row_clusters AS $$
DECLARE
  sql TEXT;
  cluster_n INT := 1;
  clustered INT[];
  clusterA INT[];
  clusterB INT[];
  history INT[];
  seed INT;
  rows INT;
  alen INT;
  marker INT;
  resolution INT := 10;
  resolution_v INT;
  ret axh_cdb_row_clusters;
BEGIN
   -- Get the total number of rows we are looking at
   sql := 'SELECT count(*) FROM (' || _t || ') t';
   EXECUTE sql INTO rows;
   
   -- Get the first value to start with
   WHILE COALESCE(array_length(clustered, 1),0) < rows LOOP
	   EXECUTE 'SELECT min(cartodb_id) FROM (' || _t || ') t WHERE ($1 IS NULL OR NOT (cartodb_id = ANY ($1)))' INTO seed USING clustered;
	   -- Get the first cluster
	   sql := 'SELECT array_agg(cartodb_id) FROM (' || _t || ') t WHERE ST_Distance(the_geom::geography, (SELECT the_geom::geography FROM (' || _t || ') g WHERE cartodb_id = ' || seed || ')) < '|| resolution ||' * CDB_XYZ_Resolution( '|| zoom || ')';
	   EXECUTE sql INTO clusterA;

	   -- Reset variable resolution
	   resolution_v := resolution;

	   alen := array_length(clusterA, 1); 

	   -- Evaluate the cluster
	   WHILE alen > resolution_v LOOP
	   	   --Extend the resolution
	   	   resolution_v := LEAST(resolution_v + resolution, resolution_v + (alen / resolution_v));

		   EXECUTE 'SELECT array_agg(cartodb_id) FROM (' || _t || ') t WHERE NOT (cartodb_id = ANY ($1)) AND ST_Distance(the_geom::geography, (SELECT the_geom::geography FROM (' || _t || ') g WHERE cartodb_id = ' || seed || ')) < '|| resolution_v ||' * CDB_XYZ_Resolution( '|| zoom || ')' INTO clusterB USING clusterA;

	   	   alen := array_length(clusterB, 1); 
		   clusterA = array_cat(clusterA, clusterB);
	   END LOOP;

	   SELECT seed cartodb_id, cluster_n cluster_id, array_length(clusterA, 1) size INTO ret.cartodb_id, ret.cluster_id, ret.size;
	   RETURN NEXT ret;
	   cluster_n := cluster_n + 1; 
	   clustered = array_cat(clustered, clusterA);
	END LOOP;
   RETURN;
END
$$ language plpgsql;