This experiment defines and visualizes the concepts of intersection and difference between two word clouds.
Two distinct documents, A and B (the first about infovis and datavis, the second about Human-Computer Interaction), are each used to compute word frequencies. Each word the two documents have in common is put into the intersection set with a weight equal to the minimum of its original weights (something reminiscent of a fuzzy intersection operation). The remaining words form the difference sets. For each common word, the set in which it had the greater weight also retains it, with its weight reduced by the intersection weight just defined. In pseudocode:
for each word in common between A and B
    let a_w be the weight of the word in A
    let b_w be the weight of the word in B
    put the word into the intersection set with weight = min(a_w, b_w)
    if a_w - min(a_w, b_w) > 0
        put the word into the A \ B set with weight = a_w - min(a_w, b_w)
    else if b_w - min(a_w, b_w) > 0
        put the word into the B \ A set with weight = b_w - min(a_w, b_w)
for each remaining word in A
    put the word into the A \ B set with weight = a_w (its original weight)
for each remaining word in B
    put the word into the B \ A set with weight = b_w (its original weight)
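The same construction can also be written as a small standalone function. Below is a minimal sketch in plain JavaScript, assuming the two documents have already been reduced to word → count maps; the function name and the sample counts are illustrative only, not part of the example's source.

// Minimal sketch of the intersection / difference construction.
// countsA and countsB are plain objects mapping word -> frequency.
function wordCloudSets(countsA, countsB) {
  var intersection = {}, diffA = {}, diffB = {};
  Object.keys(countsA).forEach(function(word) {
    if (word in countsB) {
      var min = Math.min(countsA[word], countsB[word]);
      intersection[word] = min;                            // fuzzy-style intersection weight
      if (countsA[word] - min > 0) diffA[word] = countsA[word] - min;
      if (countsB[word] - min > 0) diffB[word] = countsB[word] - min;
    } else {
      diffA[word] = countsA[word];                         // word appears only in A
    }
  });
  Object.keys(countsB).forEach(function(word) {
    if (!(word in countsA)) diffB[word] = countsB[word];   // word appears only in B
  });
  return {intersection: intersection, diffA: diffA, diffB: diffB};
}

// Example with made-up counts:
// wordCloudSets({data: 10, design: 4}, {user: 7, design: 6})
// -> intersection: {design: 4}, diffA: {data: 10}, diffB: {design: 2, user: 7}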
The three resulting “clouds” show that some words can be read as more specific to the infovis-datavis world (e.g. data, visualization, graphic, arts), while others are more common in HCI (e.g. user, computer, system). There is also a fair amount of intersection between the two (e.g. information, design, research, field). Note that the size of the intersection set is heavily influenced by the choice of removing stopwords.
While not completely satisfactory in its meaningfulness (as is often the case with word clouds), this experiment could open an interesting research path. Many different formulas could define a notion of intersection (one that normalizes on document length, or even something inspired by TF-IDF, could be interesting), and many choices are available for representing the results (a round, Venn-like layout could be nice).
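As an illustration of one such variation (not used in this example), the counts could be normalized by document length before taking the minimum, so that a longer document does not dominate the intersection weights. The helper below is a hypothetical sketch:

// Hypothetical variation: turn raw counts into relative frequencies,
// so documents of different lengths contribute on a comparable scale.
function normalizedCounts(counts) {
  var total = Object.keys(counts).reduce(function(sum, w) { return sum + counts[w]; }, 0);
  var relative = {};
  Object.keys(counts).forEach(function(w) { relative[w] = counts[w] / total; });
  return relative;
}

// The intersection weight of a shared word would then be
// min(relativeA[word], relativeB[word]) instead of min(a_w, b_w).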
This example uses a treemap word cloud layout, the nlp_compromise JavaScript library, and a precomputed list of English stopwords.
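For reference, the word frequencies are obtained exactly as in the code below: unigram counts from nlp.ngram, filtered against a stopword index. Wrapped as a helper (the function name is illustrative; nlp is the global exposed by nlp.js, and stopwords is the lookup object built from the stopword list):

// Unigram frequencies with stopwords removed, as done in the source below.
function wordFrequencies(text, stopwords) {
  return nlp.ngram(text, {min_count: 1, max_size: 1})[0]
    .filter(function(w) { return !(w.word in stopwords); });
}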
// Generated by CoffeeScript 1.4.0
(function() {
var color, correct_x, correct_y, height, svg, treemap, vis, width, zoom, zoomable_layer;
svg = d3.select('svg');
width = svg.node().getBoundingClientRect().width;
height = svg.node().getBoundingClientRect().height;
treemap = d3.layout.treemap().size([width, height]).value(function(node) {
return node.count;
}).sort(function(a, b) {
if (a.name === 'a' || b.name === 'b') {
return +1;
}
if (a.name === 'b' || b.name === 'a') {
return -1;
}
return a.count - b.count;
}).ratio(1 / 3).padding(function(node) {
if (node.depth === 0) {
return [0, 0, 40, 0];
} else if (node.depth === 1) {
return 4;
} else {
return 0;
}
}).round(false);
correct_x = d3.scale.linear().domain([0, width]).range([0, width * 1.05]);
correct_y = d3.scale.linear().domain([0, height]).range([0, height * 3 / 4]);
color = function(txt, set) {
var iset, noise;
iset = {
'a': 0,
'intersection': 1,
'b': 2
}[set];
Math.seedrandom(txt + 'abcdef');
noise = function(W) {
return Math.random() * W - W / 2;
};
return d3.hcl(iset * 90 + noise(90), 40, 50);
};
svg.attr({
viewBox: "" + (-width / 2) + " " + (-height / 2) + " " + width + " " + height
});
zoomable_layer = svg.append('g');
zoom = d3.behavior.zoom().scaleExtent([1, 10]).on('zoom', function() {
return zoomable_layer.attr({
transform: "translate(" + (zoom.translate()) + ")scale(" + (zoom.scale()) + ")"
});
});
svg.call(zoom);
vis = zoomable_layer.append('g').attr({
transform: "translate(" + (-width / 2) + "," + (-height / 2) + ")"
});
d3.csv('english_stopwords_long.txt', function(stopwords_array) {
var stopwords;
stopwords = {};
stopwords_array.forEach(function(w) {
return stopwords[w.word] = true;
});
return d3.text('infovis.txt', function(infovis_txt) {
var data_a, index_a;
data_a = nlp.ngram(infovis_txt, {
min_count: 1,
max_size: 1
})[0].filter(function(w) {
return !(w.word in stopwords);
});
index_a = {};
data_a.forEach(function(d) {
return index_a[d.word] = d;
});
return d3.text('hci.txt', function(hci_txt) {
var a, b, data_b, diff_a, diff_b, enter_labels, index_b, intersection, labels, nodes_data, tree;
data_b = nlp.ngram(hci_txt, {
min_count: 1,
max_size: 1
})[0].filter(function(w) {
return !(w.word in stopwords);
});
index_b = {};
data_b.forEach(function(d) {
return index_b[d.word] = d;
});
diff_a = data_a.filter(function(a) {
return !(a.word in index_b);
});
diff_b = data_b.filter(function(b) {
return !(b.word in index_a);
});
intersection = [];
data_a.forEach(function(a) {
return data_b.forEach(function(b) {
var min;
if (a.word === b.word) {
min = Math.min(a.count, b.count);
intersection.push({
word: a.word,
count: min
});
if (a.count - min > 0) {
diff_a.push({
word: a.word,
count: a.count - min
});
}
if (b.count - min > 0) {
return diff_b.push({
word: b.word,
count: b.count - min
});
}
}
});
});
a = {
children: diff_a.filter(function(d) {
return d.count > 1;
}),
name: "a"
};
intersection = {
children: intersection.filter(function(d) {
return d.count > 1;
}),
name: "intersection"
};
b = {
children: diff_b.filter(function(d) {
return d.count > 1;
}),
name: "b"
};
tree = {
children: [a, intersection, b],
name: "root"
};
nodes_data = treemap.nodes(tree);
labels = vis.selectAll('.label').data(nodes_data.filter(function(node) {
return node.depth === 2;
}));
enter_labels = labels.enter().append('svg').attr({
"class": 'label'
});
enter_labels.append('text').text(function(node) {
return node.word.toUpperCase();
}).attr({
dy: '0.35em',
fill: function(node) {
return color(node.word, node.parent.name);
}
}).each(function(node) {
var bbox, bbox_aspect, node_bbox, node_bbox_aspect, rotate;
bbox = this.getBBox();
bbox_aspect = bbox.width / bbox.height;
node_bbox = {
width: node.dx,
height: node.dy
};
node_bbox_aspect = node_bbox.width / node_bbox.height;
rotate = bbox_aspect >= 1 && node_bbox_aspect < 1 || bbox_aspect < 1 && node_bbox_aspect >= 1;
node.label_bbox = {
x: bbox.x + (bbox.width - correct_x(bbox.width)) / 2,
y: bbox.y + (bbox.height - correct_y(bbox.height)) / 2,
width: correct_x(bbox.width),
height: correct_y(bbox.height)
};
if (rotate) {
node.label_bbox = {
x: node.label_bbox.y,
y: node.label_bbox.x,
width: node.label_bbox.height,
height: node.label_bbox.width
};
return d3.select(this).attr('transform', 'rotate(-90)');
}
});
enter_labels.attr({
x: function(node) {
return node.x;
},
y: function(node) {
return node.y;
},
width: function(node) {
return node.dx;
},
height: function(node) {
return node.dy;
},
viewBox: function(node) {
return "" + node.label_bbox.x + " " + node.label_bbox.y + " " + node.label_bbox.width + " " + node.label_bbox.height;
},
preserveAspectRatio: 'none'
});
vis.append('text').text('A ∖ B').attr({
"class": 'set_label',
x: a.x + a.dx / 2,
y: height - 22,
dy: '0.35em'
});
vis.append('text').text('A ∩ B').attr({
"class": 'set_label',
x: intersection.x + intersection.dx / 2,
y: height - 22,
dy: '0.35em'
});
return vis.append('text').text('B ∖ A').attr({
"class": 'set_label',
x: b.x + b.dx / 2,
y: height - 22,
dy: '0.35em'
});
});
});
});
}).call(this);
<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8">
<script src="//davidbau.com/encode/seedrandom-min.js"></script>
<script src="//d3js.org/d3.v3.min.js"></script>
<script src="nlp.js"></script>
<link rel="stylesheet" type="text/css" href="index.css">
<title>Word cloud intersection</title>
</head>
<body>
<svg width="960px" height="500px"></svg>
<script src="index.js"></script>
</body>
</html>
word
a
about
above
after
again
against
all
am
an
and
any
are
aren't
as
at
be
because
been
before
being
below
between
both
but
by
can't
cannot
could
couldn't
did
didn't
do
does
doesn't
doing
don't
down
during
each
few
for
from
further
had
hadn't
has
hasn't
have
haven't
having
he
he'd
he'll
he's
her
here
here's
hers
herself
him
himself
his
how
how's
i
i'd
i'll
i'm
i've
if
in
into
is
isn't
it
it's
its
itself
let's
me
more
most
mustn't
my
myself
no
nor
not
of
off
on
once
only
or
other
ought
our
ours
ourselves
out
over
own
same
shan't
she
she'd
she'll
she's
should
shouldn't
so
some
such
than
that
that's
the
their
theirs
them
themselves
then
there
there's
these
they
they'd
they'll
they're
they've
this
those
through
to
too
under
until
up
very
was
wasn't
we
we'd
we'll
we're
we've
were
weren't
what
what's
when
when's
where
where's
which
while
who
who's
whom
why
why's
with
won't
would
wouldn't
you
you'd
you'll
you're
you've
your
yours
yourself
yourselves
a
b
c
d
e
f
g
h
i
j
k
l
m
n
o
p
q
r
s
t
u
v
w
x
y
z
word
will
one
two
three
four
five
six
seven
eight
nine
ten
-
a
able
about
above
abst
accordance
according
accordingly
across
act
actually
added
adj
affected
affecting
affects
after
afterwards
again
against
ah
all
almost
alone
along
already
also
although
always
am
among
amongst
an
and
announce
another
any
anybody
anyhow
anymore
anyone
anything
anyway
anyways
anywhere
apparently
approximately
are
aren
arent
aren't
arise
around
as
aside
ask
asking
at
auth
available
away
awfully
b
back
be
became
because
become
becomes
becoming
been
before
beforehand
begin
beginning
beginnings
begins
behind
being
believe
below
beside
besides
between
beyond
biol
both
brief
briefly
but
by
c
ca
came
can
cannot
can't
cause
causes
certain
certainly
co
com
come
comes
contain
containing
contains
could
couldnt
couldn't
d
date
did
didn't
do
does
doesnt
doesn't
doing
done
dont
don't
down
downwards
due
during
e
each
ed
edu
effect
eg
eight
eighty
either
else
elsewhere
end
ending
enough
especially
et
et-al
etc
even
evenly
ever
every
everybody
everyone
everything
everywhere
except
f
far
few
ff
following
follows
for
former
formerly
found
from
further
furthermore
g
gave
get
gets
getting
give
given
gives
giving
go
goes
gone
got
gotten
h
had
happens
hardly
has
hasnt
hasn't
have
havent
haven't
having
he
hed
he'd
he'll
hence
her
here
hereafter
hereby
herein
heres
hereupon
hers
herself
hes
he's
hi
him
himself
his
how
how's
howbeit
however
i
id
i'd
ie
if
i'll
im
i'm
immediate
immediately
in
inc
indeed
index
instead
into
inward
is
isnt
isn't
it
itd
it'd
itll
it'll
its
it's
itself
ive
i've
j
just
k
keep
keeps
kept
know
known
knows
l
largely
last
lately
later
latter
latterly
least
less
lest
let
lets
like
liked
likely
line
little
'll
look
looking
looks
ltd
m
made
mainly
make
makes
many
may
maybe
me
mean
means
meantime
meanwhile
merely
mg
might
million
miss
ml
more
moreover
most
mostly
mr
mrs
much
mug
must
my
myself
n
na
name
namely
nay
nd
near
nearly
necessarily
necessary
need
needs
neither
never
nevertheless
new
next
no
nobody
non
none
nonetheless
noone
nor
normally
nos
not
noted
nothing
now
nowhere
o
obviously
of
off
often
oh
ok
okay
on
once
ones
one's
only
onto
or
ord
other
others
otherwise
ought
our
ours
ourselves
out
outside
over
overall
owing
own
p
part
particular
particularly
past
per
perhaps
please
plus
possible
possibly
potentially
pp
previously
primarily
probably
promptly
put
q
que
quickly
quite
qv
r
rather
rd
re
're
readily
really
recent
recently
ref
refs
regarding
regardless
regards
related
relatively
respectively
resulted
resulting
retweet
rt
s
's
said
same
saw
say
saying
says
sec
seem
seemed
seeming
seems
seen
self
selves
sent
several
shall
she
she'd
she'll
shes
she's
should
shouldn't
showed
shown
showns
shows
significant
significantly
similar
similarly
since
slightly
so
some
somebody
somehow
someone
somethan
something
sometime
sometimes
somewhat
somewhere
soon
sorry
specifically
specified
specify
specifying
still
stop
strongly
sub
substantially
successfully
such
sufficiently
sup
'sup
sure
t
take
taken
taking
tell
tends
th
than
thank
thanks
thanx
that
that'll
thats
that's
that've
the
their
theirs
them
themselves
then
thence
there
thereafter
thereby
thered
there'd
therefore
therein
there'll
thereof
therere
there're
theres
there's
thereto
thereupon
there've
these
they
theyd
they'd
they'll
theyre
they're
theyve
they've
think
thinks
this
those
thou
though
thoughh
thousand
throug
through
throughout
thru
thus
til
to
together
too
took
tooks
toward
towards
tried
tries
truly
try
trying
ts
twice
u
un
under
unfortunately
unless
unlike
unlikely
until
unto
up
upon
ups
us
use
used
useful
usefully
usefulness
uses
using
usually
v
value
various
've
very
via
viz
vol
vols
vs
w
want
wants
was
wasnt
wasn't
way
we
wed
we'd
welcome
we'll
well
went
were
we're
weren't
we've
what
whatever
what'll
whats
what's
when
whence
whenever
where
whereafter
whereas
whereby
wherein
wheres
where's
whereupon
wherever
whether
which
while
whim
who
whod
who'd
whoever
whole
who'll
whom
whomever
whos
who's
whose
why
widely
willing
wish
with
within
without
won't
words
world
would
wouldn't
x
y
yes
yet
you
youd
you'd
youll
you'll
your
you're
youre
yours
yourself
yourselves
youve
you've
z
svg = d3.select('svg')
width = svg.node().getBoundingClientRect().width
height = svg.node().getBoundingClientRect().height
treemap = d3.layout.treemap()
.size([width, height])
.value((node) -> node.count)
.sort((a,b) ->
return +1 if a.name is 'a' or b.name is 'b'
return -1 if a.name is 'b' or b.name is 'a'
return a.count-b.count
)
.ratio(1/3)
.padding((node) ->
if node.depth is 0
return [0,0,40,0] # make room for set labels
else if node.depth is 1
return 4
else
return 0
)
.round(false) # bugfix: d3 wrong ordering
correct_x = d3.scale.linear()
.domain([0, width])
.range([0, width*1.05])
correct_y = d3.scale.linear()
.domain([0, height])
.range([0, height*3/4])
# define a stable color scale to differentiate words and sets
color = (txt, set) ->
iset = {'a': 0, 'intersection': 1, 'b': 2}[set]
Math.seedrandom(txt+'abcdef')
noise = (W) -> Math.random()*W - W/2
d3.hcl(iset*90+noise(90), 40, 50)
# translate the viewBox to have (0,0) at the center of the vis
svg
.attr
viewBox: "#{-width/2} #{-height/2} #{width} #{height}"
# append a group for zoomable content
zoomable_layer = svg.append('g')
# define a zoom behavior
zoom = d3.behavior.zoom()
.scaleExtent([1,10]) # min-max zoom
.on 'zoom', () ->
# GEOMETRIC ZOOM
zoomable_layer
.attr
transform: "translate(#{zoom.translate()})scale(#{zoom.scale()})"
# bind the zoom behavior to the main SVG
svg.call(zoom)
# group the visualization
vis = zoomable_layer.append('g')
.attr
transform: "translate(#{-width/2},#{-height/2})"
d3.csv 'english_stopwords_long.txt', (stopwords_array) ->
# build an index of stopwords
stopwords = {}
stopwords_array.forEach (w) -> stopwords[w.word] = true
d3.text 'infovis.txt', (infovis_txt) ->
data_a = nlp.ngram(infovis_txt, {min_count: 1, max_size: 1})[0].filter (w) -> w.word not of stopwords
index_a = {}
data_a.forEach (d) ->
index_a[d.word] = d
d3.text 'hci.txt', (hci_txt) ->
data_b = nlp.ngram(hci_txt, {min_count: 1, max_size: 1})[0].filter (w) -> w.word not of stopwords
index_b = {}
data_b.forEach (d) ->
index_b[d.word] = d
diff_a = data_a.filter (a) -> a.word not of index_b
diff_b = data_b.filter (b) -> b.word not of index_a
intersection = []
data_a.forEach (a) ->
data_b.forEach (b) ->
if a.word is b.word
min = Math.min(a.count, b.count)
intersection.push {word: a.word, count: min}
if a.count-min > 0
diff_a.push {word: a.word, count: a.count-min}
if b.count-min > 0
diff_b.push {word: b.word, count: b.count-min}
a = {
children: (diff_a.filter (d) -> d.count > 1),
name: "a"
}
intersection = {
children: (intersection.filter (d) -> d.count > 1),
name: "intersection"
}
b = {
children: (diff_b.filter (d) -> d.count > 1),
name: "b"
}
tree = {
children: [a,intersection,b],
name: "root"
}
nodes_data = treemap.nodes(tree)
labels = vis.selectAll('.label')
.data(nodes_data.filter((node) -> node.depth is 2))
enter_labels = labels.enter().append('svg')
.attr
class: 'label'
enter_labels.append('text')
.text((node) -> node.word.toUpperCase())
.attr
dy: '0.35em'
fill: (node) -> color(node.word, node.parent.name)
.each (node) ->
bbox = this.getBBox()
bbox_aspect = bbox.width / bbox.height
node_bbox = {width: node.dx, height: node.dy}
node_bbox_aspect = node_bbox.width / node_bbox.height
rotate = bbox_aspect >= 1 and node_bbox_aspect < 1 or bbox_aspect < 1 and node_bbox_aspect >= 1
node.label_bbox = {
x: bbox.x+(bbox.width-correct_x(bbox.width))/2,
y: bbox.y+(bbox.height-correct_y(bbox.height))/2,
width: correct_x(bbox.width),
height: correct_y(bbox.height)
}
if rotate
node.label_bbox = {
x: node.label_bbox.y,
y: node.label_bbox.x,
width: node.label_bbox.height,
height: node.label_bbox.width
}
d3.select(this).attr('transform', 'rotate(-90)')
enter_labels
.attr
x: (node) -> node.x
y: (node) -> node.y
width: (node) -> node.dx
height: (node) -> node.dy
viewBox: (node) -> "#{node.label_bbox.x} #{node.label_bbox.y} #{node.label_bbox.width} #{node.label_bbox.height}"
preserveAspectRatio: 'none'
# draw set labels
vis.append('text')
.text('A ∖ B')
.attr
class: 'set_label'
x: a.x + a.dx/2
y: height - 22
dy: '0.35em'
vis.append('text')
.text('A ∩ B')
.attr
class: 'set_label'
x: intersection.x + intersection.dx/2
y: height - 22
dy: '0.35em'
vis.append('text')
.text('B ∖ A')
.attr
class: 'set_label'
x: b.x + b.dx/2
y: height - 22
dy: '0.35em'
svg {
background: white;
}
.node {
shape-rendering: crispEdges;
vector-effect: non-scaling-stroke;
stroke: white;
stroke-width: 2;
}
.label {
pointer-events: none;
text-anchor: middle;
font-family: Impact;
}
.set_label {
fill: #444;
font-family: serif;
font-size: 26px;
text-anchor: middle;
font-weight: bold;
}