doc2vec data space on my Zotero papers
20D to 2D with t-SNE
forked from Voronoi Scatterplot with diagram.find()
/**
 * Draw the t-SNE scatterplot of the doc2vec document space inside `#chart`.
 *
 * Loads `./tsne-coords-labels-run-1.csv` with d3, fills in the page title
 * elements, renders both axes and one circle per CSV row, and wires up a
 * hover interaction: the mark nearest to the pointer (within
 * `maxDistanceFromPoint` pixels) is looked up in a lazily built Voronoi
 * diagram and highlighted with a Bootstrap popover plus guide lines that
 * run to both axes.
 *
 * Relies on the globals `d3` and `$` (jQuery with the `popover` plugin).
 *
 * @returns {Promise<void>} resolves once the chart has been drawn
 */
async function draw() {
  ////////////////////////////////////////////////////////////
  //////////////////////// Config ////////////////////////////
  ////////////////////////////////////////////////////////////
  const xVariable = 'tsne-2d-one'
  const yVariable = 'tsne-2d-two'
  const sizeVariable = 'size'
  const colorVariable = '1'
  const colorVariableType = 'continuous'
  const idVariable = 'id'
  const labelVariable = 'label'
  const yAxisLabel = ''
  const xAxisLabel = ''
  const pageTitle = '20D to 2D with t-SNE'
  const title = 'Zotero papers doc2vec data space'
  const subtitle = '20D to 2D with t-SNE'
  const colorPalette = [
    '#EFB605',
    '#E58903',
    '#E01A25',
    '#C20049',
    '#991C71',
    '#66489F',
    '#2074A0',
    '#10A66E',
    '#7EB852'
  ]
  // Set to a [min, max] pair to override the data-driven x domain
  const customXDomain = false

  ////////////////////////////////////////////////////////////
  /////////////////////// Load Data //////////////////////////
  ////////////////////////////////////////////////////////////
  const marks = await d3.csv('./tsne-coords-labels-run-1.csv')
  marks.forEach((mark, i) => {
    mark.color = i
    mark.size = 1
    // The id doubles as a CSS class on the circle, hence the letter prefix
    mark.id = `id${i}`
    // d3.csv yields strings; the scales need numbers
    mark[xVariable] = Number(mark[xVariable])
    mark[yVariable] = Number(mark[yVariable])
  })

  ////////////////////////////////////////////////////////////
  //////////////////// Setup the Page ////////////////////////
  ////////////////////////////////////////////////////////////
  d3.select('title').html(pageTitle)
  d3.select('#title').html(title)
  d3.select('#subtitle').html(subtitle)

  // Quick fix for resizing some things for mobile-ish viewers
  const mobileScreen = $(window).innerWidth() < 500

  // Scatterplot
  const margin = { left: 60, top: 20, right: 80, bottom: 50 }
  const width = Math.max($('#chart').width(), 960) - margin.left - margin.right
  const height = (width * 2) / 3

  const svg = d3
    .select('#chart')
    .append('svg')
    .attr('width', width + margin.left + margin.right)
    .attr('height', height + margin.top + margin.bottom)

  const wrapper = svg
    .append('g')
    .attr('transform', `translate(${margin.left},${margin.top})`)

  //////////////////////////////////////////////////////
  ///////////// Initialize Axes & Scales ///////////////
  //////////////////////////////////////////////////////
  const opacityCircles = 0.7
  const maxDistanceFromPoint = 50

  // Set the color
  let colorDomain
  let color
  if (colorVariableType === 'continuous') {
    // Numeric comparator: the default sort would order numbers as strings
    colorDomain = Array.from(
      new Set(marks.map(d => Number(d[colorVariable])))
    ).sort((a, b) => a - b)
    color = d3
      .scaleSequential(d3.interpolateViridis)
      .domain(d3.extent(colorDomain))
  } else {
    // ordinal
    colorDomain = Array.from(new Set(marks.map(d => d[colorVariable]))).sort()
    color = d3
      .scaleOrdinal()
      .range(colorPalette)
      .domain(colorDomain)
  }

  // Set the new x axis range
  const xScale = d3.scaleLinear().range([0, width])
  if (customXDomain) {
    // An exact custom domain is used as-is, without "nice" rounding
    xScale.domain(customXDomain)
  } else {
    xScale.domain(d3.extent(marks, d => d[xVariable])).nice()
  }

  // Set new x-axis
  const xAxis = d3
    .axisBottom()
    .ticks(6)
    .scale(xScale)

  // Append the x-axis
  wrapper
    .append('g')
    .attr('class', 'x axis')
    .attr('transform', `translate(0,${height})`)
    .call(xAxis)

  // Set the new y axis range
  const yScale = d3
    .scaleLinear()
    .range([height, 0])
    .domain(d3.extent(marks, d => d[yVariable]))
    .nice()

  const yAxis = d3
    .axisLeft()
    .ticks(6) // Set rough # of ticks
    .scale(yScale)

  // Append the y-axis
  wrapper
    .append('g')
    .attr('class', 'y axis')
    .attr('transform', 'translate(0,0)')
    .call(yAxis)

  // Scale for the bubble size
  const rScale = d3
    .scaleSqrt()
    .range([1, mobileScreen ? 10 : 4])
    .domain(d3.extent(marks, d => d[sizeVariable]))

  //////////////////////////////////////////////////////
  ///////////////// Initialize Labels //////////////////
  //////////////////////////////////////////////////////
  const axisLabelFontSize = `${mobileScreen ? 8 : 12}px`

  // Set up X axis label
  wrapper
    .append('g')
    .append('text')
    .attr('class', 'x title')
    .attr('text-anchor', 'end')
    .style('font-size', axisLabelFontSize)
    .attr('transform', `translate(${width},${height - 10})`)
    .text(xAxisLabel)

  // Set up y axis label
  wrapper
    .append('g')
    .append('text')
    .attr('class', 'y title')
    .attr('text-anchor', 'end')
    .style('font-size', axisLabelFontSize)
    .attr('transform', 'translate(18, 0) rotate(-90)')
    .text(yAxisLabel)

  ////////////////////////////////////////////////////////////
  ///// Capture mouse events and voronoi.find() the site /////
  ////////////////////////////////////////////////////////////
  // The diagram is built on the first mouse move rather than up front, and
  // uses the same x/y accessors as the cx/cy of the circles below.
  let diagram = null
  // Voronoi site whose tooltip is currently shown, if any
  let tooltipped = null

  svg.on('mousemove', function() {
    if (!diagram) {
      console.log('computing the voronoi…')
      diagram = d3
        .voronoi()
        .x(d => xScale(d[xVariable]))
        .y(d => yScale(d[yVariable]))(marks)
      console.log('…done.')
    }

    // Pointer position relative to the svg, shifted into the wrapper's frame
    const p = d3.mouse(this)
    p[0] -= margin.left
    p[1] -= margin.top

    // don't react if the mouse is close to one of the axis
    const site =
      p[0] < 5 || p[1] < 5
        ? null
        : diagram.find(p[0], p[1], maxDistanceFromPoint)

    if (site !== tooltipped) {
      if (tooltipped) removeTooltip(tooltipped.data)
      if (site) showTooltip(site.data)
      tooltipped = site
    }
  })

  ////////////////////////////////////////////////////////////
  /////////////////// Scatterplot Circles ////////////////////
  ////////////////////////////////////////////////////////////
  // Initiate a group element for the circles
  const circleGroup = wrapper.append('g').attr('class', 'circleWrapper')

  // Sort a copy so the biggest circles are drawn first (i.e. below the rest).
  // The comparator must return a number; a boolean does not sort reliably.
  const marksBySizeDesc = [...marks].sort(
    (a, b) => b[sizeVariable] - a[sizeVariable]
  )

  // Place the circle marks
  circleGroup
    .selectAll('circle.marks')
    .data(marksBySizeDesc)
    .enter()
    .append('circle')
    .attr('class', d => `marks ${d[idVariable]}`)
    .attr('cx', d => xScale(d[xVariable]))
    .attr('cy', d => yScale(d[yVariable]))
    .attr('r', d => rScale(d[sizeVariable]))
    .style('opacity', opacityCircles)
    .style('fill', d => color(d[colorVariable]))

  ///////////////////////////////////////////////////////////////////////////
  /////////////////// Hover functions of the circles ////////////////////////
  ///////////////////////////////////////////////////////////////////////////

  /**
   * Hide the tooltip when the mouse moves away from a mark.
   *
   * @param {Object} d - datum of the mark that was highlighted
   */
  function removeTooltip(d) {
    // Restore the circle's resting opacity
    d3.selectAll(`.marks.${d[idVariable]}`).style('opacity', opacityCircles)

    // Hide tooltip
    $('.popover').remove()

    // Fade out guide lines, then remove them
    d3.selectAll('.guide')
      .transition()
      .duration(200)
      .style('opacity', 0)
      .remove()
  } // function removeTooltip

  /**
   * Show the tooltip and axis guides for the hovered mark.
   *
   * @param {Object} d - datum of the mark to highlight
   */
  function showTooltip(d) {
    // Select the circle itself (so not the voronoi cell)
    const element = d3.select(`.marks.${d[idVariable]}`)
    // Public accessor for the DOM node, instead of the private `_groups` field
    const node = element.node()

    // Define and show the tooltip
    $(node).popover({
      placement: 'auto top',
      container: '#chart',
      trigger: 'manual',
      html: true,
      content() {
        return `<span style='font-size: 11px; text-align: center;'>${d[labelVariable]}</span>`
      }
    })
    $(node).popover('show')

    // Make chosen circle more visible
    element.style('opacity', 1)

    // Position and colour of the guides follow the circle
    const x = +element.attr('cx')
    const y = +element.attr('cy')
    const markColor = element.style('fill')

    // Append lines to bubbles that will be used to show the precise data points
    // vertical line
    wrapper
      .append('line')
      .attr('class', 'guide')
      .attr('x1', x)
      .attr('x2', x)
      .attr('y1', y)
      .attr('y2', height + 20)
      .style('stroke', markColor)
      .style('opacity', 0)
      .transition()
      .duration(200)
      .style('opacity', 0.5)

    // Value on the x axis
    wrapper
      .append('text')
      .attr('class', 'guide')
      .attr('x', x)
      .attr('y', height + 38)
      .style('fill', markColor)
      .style('opacity', 0)
      .style('text-anchor', 'middle')
      .text(`${d3.format('.2s')(d[xVariable])}`)
      .transition()
      .duration(200)
      .style('opacity', 0.5)

    // horizontal line
    wrapper
      .append('line')
      .attr('class', 'guide')
      .attr('x1', x)
      .attr('x2', -20)
      .attr('y1', y)
      .attr('y2', y)
      .style('stroke', markColor)
      .style('opacity', 0)
      .transition()
      .duration(200)
      .style('opacity', 0.5)

    // Value on the y axis
    wrapper
      .append('text')
      .attr('class', 'guide')
      .attr('x', -25)
      .attr('y', y)
      .attr('dy', '0.35em')
      .style('fill', markColor)
      .style('opacity', 0)
      .style('text-anchor', 'end')
      .text(d3.format('.1f')(d[yVariable]))
      .transition()
      .duration(200)
      .style('opacity', 0.5)
  } // function showTooltip
}
// Kick off rendering; report a failed CSV load or render error instead of
// leaving the promise unhandled.
draw().catch(err => {
  console.error('Failed to draw the scatterplot:', err)
})
<!DOCTYPE html>
<!-- Host page for the scatterplot: loads the libraries, then index.js at the end of <body>. -->
<html>
<head>
<meta http-equiv="Content-Type" content="text/html;charset=utf-8" />
<!-- Left empty in the markup. NOTE(review): presumably filled in at runtime by index.js; confirm. -->
<title></title>
<!-- D3.js (v5) -->
<script src="//d3js.org/d3.v5.js"></script>
<!-- jQuery 1.11.1, loaded before the Bootstrap JavaScript below -->
<script src="//ajax.googleapis.com/ajax/libs/jquery/1.11.1/jquery.min.js"></script>
<!-- Bootstrap 3.3.2 compiled and minified CSS -->
<link
rel="stylesheet"
href="//maxcdn.bootstrapcdn.com/bootstrap/3.3.2/css/bootstrap.min.css"
/>
<!-- Bootstrap 3.3.2 compiled and minified JavaScript -->
<script src="//maxcdn.bootstrapcdn.com/bootstrap/3.3.2/js/bootstrap.min.js"></script>
<!-- Open Sans & CSS -->
<link
href="//fonts.googleapis.com/css?family=Open+Sans:700,400,300"
rel="stylesheet"
type="text/css"
/>
<!-- Inline data-URI icon. NOTE(review): presumably here to avoid a separate favicon request; confirm. -->
<link rel="icon" href="data:;base64,iVBORw0KGgo=" />
<link href="styles.css" rel="stylesheet" />
</head>
<body>
<div id="cont" class="container-fluid text-center">
<div class="row scatter">
<!-- Empty placeholders: heading, sub-heading and the chart container -->
<h5 style="color: #3B3B3B;" id="title"></h5>
<h6 style="color: #A6A6A6;" id="subtitle"></h6>
<div id="chart"></div>
</div>
</div>
<!-- Loaded last, after the elements above exist in the document -->
<script src="index.js"></script>
</body>
</html>
#!/usr/bin/env python
# coding: utf-8
# # embed-text-doc2vec
# based on https://medium.com/@mishra.thedeepak/doc2vec-simple-implementation-example-df2afbbfbad5
# first, let's install some dependencies. a guide to doing this: https://jakevdp.github.io/blog/2017/12/05/installing-python-packages-from-jupyter/
# In[1]:
# Install a conda package in the current Jupyter kernel
import sys
get_ipython().system('conda install --yes --prefix {sys.prefix} gensim nltk')
# In[2]:
#Import all the dependencies
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize
# Let’s prepare data for training our doc2vec model
# In[3]:
data_dir = '../../data/'
# our list of documents (one string per .txt file, in the order of txt_files)
data = []

# In[4]:

import glob
import os

txt_files = glob.glob(f"{data_dir}/*.txt")
print(len(txt_files))

# In[5]:

# show an example of just the filename without the path
# (basename instead of a slice hard-coded to the length of data_dir)
os.path.basename(txt_files[0])

# In[6]:

for txt_path in txt_files:
    # the with-block closes the handle; a distinct name keeps the loop
    # variable from being shadowed by the file object
    with open(txt_path, 'r', encoding="utf-8") as txt_file:
        data.append(txt_file.read())
print(len(data))
# In[7]:
from random import randrange

# randrange excludes its upper bound, so len(data) (not len(data) - 1) lets
# every document, including the last one, be picked
random_index = randrange(len(data))
# print the first 1000 characters of a random document from our corpus
print(data[random_index][0:1000])
# In[7]:
# Wrap every document as a TaggedDocument: lower-cased word tokens, tagged
# with the document's position in `data` as a string.
tagged_data = []
for doc_index, doc_text in enumerate(data):
    doc_tokens = word_tokenize(doc_text.lower())
    tagged_data.append(TaggedDocument(words=doc_tokens, tags=[str(doc_index)]))
# Each document in `data` is now a TaggedDocument whose tag is its index in the list, so the corpus is ready for training. Let's start training our model.
# In[8]:
# Hyperparameters for the doc2vec model.
# Number of passes made by the training loop further down.
max_epochs = 100
# Dimensionality of each document vector.
vec_size = 20
# Initial learning rate handed to the model.
alpha = 0.025
# NOTE(review): `size` is presumably the older gensim spelling of
# `vector_size` -- confirm against the installed gensim version.
model = Doc2Vec(size=vec_size,
alpha=alpha,
min_alpha=0.00025,
# NOTE(review): min_count=1 presumably keeps every word, however rare -- confirm.
min_count=1,
# dm=1 selects the 'distributed memory' (PV-DM) training algorithm.
dm=1)
# In[9]:
# Build the vocabulary from the tagged corpus before training.
model.build_vocab(tagged_data)
# In[10]:
# Train in a single call and let gensim decay the learning rate from the
# model's alpha down to its min_alpha over max_epochs passes.
#
# The previous hand-rolled loop called train() once per pass and subtracted
# 0.002 from alpha each time. Starting from 0.025 that reaches zero after
# about 13 passes, so most of the 100 passes ran with a negative learning
# rate. Each of those calls also ran model.iter epochs of its own.
model.train(tagged_data,
            total_examples=model.corpus_count,
            epochs=max_epochs)

model.save("d2v.model")
print("Model d2v.model Saved")
# Note: dm defines the training algorithm. If dm=1 means ‘distributed memory’ (PV-DM) and dm =0 means ‘distributed bag of words’ (PV-DBOW). Distributed Memory model preserves the word order in a document whereas Distributed Bag of words just uses the bag of words approach, which doesn’t preserve any word order.
#
# So we have saved the model and it’s ready for implementation. Lets play with it.
# In[8]:
# Reload the model saved above from disk and try it out.
from gensim.models.doc2vec import Doc2Vec
model= Doc2Vec.load("d2v.model")
# Infer a vector for a document that is not in the training data;
# the text is lower-cased and tokenized the same way as the corpus.
test_data = word_tokenize("I love chatbots".lower())
v1 = model.infer_vector(test_data)
print("V1_infer", v1)
# In[9]:
# Find the documents most similar to a training document, looked up by tag.
# Tags are the stringified indices assigned when tagged_data was built,
# so '1' is the second document.
similar_doc = model.docvecs.most_similar('1')
print(similar_doc)
# In[10]:
# Look up the vector of a training document by its tag,
# i.e. print the vector of the document at index 1 in the training data.
print(model.docvecs['1'])
# In[11]:
# how many dimensions does our doc2vec document space have?
# `dimensions` is reused below to build the CSV column headers.
dimensions = len(model.docvecs['1'])
print(dimensions)
# Cool! This dimensionality is determined by the `vec_size` parameter we specified at training time.
# In[12]:
# Column headers for the csv file: the document name first, then one
# column per embedding dimension (v0, v1, ...).
headers = ['doc'] + [f"v{dim}" for dim in range(dimensions)]
print(headers)
# In[13]:
# retrieve vectors of all documents in training data
# write vectors to a csv file, one row per document: file name, then vector
import csv
import os

with open('document-vectors.csv', 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile, delimiter=',', quotechar='"')
    writer.writerow(headers)
    # Documents were read in the order of txt_files, so position i in
    # txt_files corresponds to document vector i.
    for i, txt_path in enumerate(txt_files):
        # basename instead of a slice hard-coded to the length of data_dir
        doc_name = os.path.basename(txt_path)
        writer.writerow([doc_name] + list(model.docvecs[i]))
# In[14]:
# read vectors in from csv file
import csv

# Read with the same encoding the file was written with (utf-8) rather than
# the platform default, so non-ASCII file names round-trip correctly.
with open('document-vectors.csv', newline='', encoding='utf-8') as csvfile:
    reader = csv.reader(csvfile, delimiter=',', quotechar='"')
    # every row, header included, as a list of strings
    imported_vectors = list(reader)
print(imported_vectors[0:2])
# In[19]:
# project from 20D to 2D with t-SNE
from __future__ import print_function
import time
import numpy as np
import pandas as pd
from sklearn.datasets import fetch_openml
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
get_ipython().run_line_magic('matplotlib', 'inline')
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import seaborn as sns
# In[20]:
model.docvecs
# In[21]:
def stripPath(file):
    """Return `file` with its first 11 characters removed."""
    return file[11:]


# One row per document vector, one column per embedding dimension.
df = pd.DataFrame(list(model.docvecs))

# 'y' is a stand-in for ground-truth categories: here simply the row index,
# with 'label' holding the same value as a string.
df['y'] = list(range(len(df)))
df['label'] = df['y'].map(str)

print('Size of the dataframe: {}'.format(df.shape))
# In[22]:
# For reproducibility of the results
np.random.seed(42)
rndperm = np.random.permutation(df.shape[0])

# In[23]:

time_start = time.time()
tsne = TSNE(n_components=2, verbose=1, perplexity=40, n_iter=300)
# Embed only the doc2vec dimensions. 'y' and 'label' are just the row index;
# passing them to t-SNE as features would let document order shape the layout.
feature_cols = [col for col in df.columns if col not in ('y', 'label')]
tsne_results = tsne.fit_transform(df[feature_cols].values)
print('t-SNE done! Time elapsed: {} seconds'.format(time.time()-time_start))
# In[39]:
# Peek at the first ten 2-D coordinates produced by t-SNE.
print(tsne_results[:10])
# In[50]:
# visualize t-SNE projection
# Work on a copy so `df` keeps only its original columns.
results = df.copy()
# Attach the two t-SNE coordinates as new columns.
results['tsne-2d-one'] = tsne_results[:,0]
results['tsne-2d-two'] = tsne_results[:,1]
plt.figure(figsize=(16,10))
# NOTE(review): with `hue` commented out, `palette` and `legend` presumably
# have no visible effect; 141 looks like the number of documents -- confirm.
sns.scatterplot(
x="tsne-2d-one", y="tsne-2d-two",
# hue="y",
palette=sns.color_palette("hls", 141),
data=results,
legend="full",
alpha=0.3
)
# In[41]:
# Debug output: first rows of the combined frame, one coordinate column,
# and the index column.
print(results[:5])
# In[42]:
print(results['tsne-2d-two'])
# In[46]:
print(df['y'])
# In[25]:
# project from 20D to 2D with UMAP
# In[33]:
# visualize UMAP projection
/* Page-wide typography */
body {
font-family: 'Open Sans', sans-serif;
font-size: 12px;
font-weight: 400;
color: #525252;
text-align: center;
}
/* Axis lines and ticks: light grey strokes, no fill */
.axis path,
.axis line {
fill: none;
stroke: #b3b3b3;
shape-rendering: crispEdges;
}
/* Axis tick labels */
.axis text {
font-size: 10px;
fill: #6b6b6b;
}
/* Scatterplot circles ignore pointer events */
.marks {
pointer-events: none;
}
/* Hover guide lines and their axis value labels */
.guide {
pointer-events: none;
font-size: 14px;
font-weight: 600;
}
/* Bootstrap popover used as the tooltip; also ignores pointer events */
.popover {
pointer-events: none;
}