Zotero papers doc2vec data space by micahstubbs

index.js

/**
 * Loads the t-SNE coordinate CSV and renders an interactive scatterplot
 * into the `#chart` element.
 *
 * Every CSV row becomes one circle positioned by `xVariable` / `yVariable`
 * and filled according to `colorVariable`. Moving the mouse over the chart
 * looks up the nearest circle through a lazily computed Voronoi diagram and
 * shows a Bootstrap popover with the row's label plus guide lines to both axes.
 *
 * Relies on the globals `d3` (uses `d3.mouse` and `d3.voronoi`) and jQuery `$`
 * with the Bootstrap popover plugin.
 *
 * @returns {Promise<void>} settles after the chart is drawn; the promise
 *   rejects if awaiting `d3.csv` rejects.
 */
async function draw() {
  ////////////////////////////////////////////////////////////
  //////////////////////// Config ////////////////////////////
  ////////////////////////////////////////////////////////////

  const xVariable = 'tsne-2d-one'
  const yVariable = 'tsne-2d-two'
  const sizeVariable = 'size'
  const colorVariable = '1'
  const colorVariableType = 'continuous'
  const idVariable = 'id'
  const labelVariable = 'label'

  const yAxisLabel = ''
  const xAxisLabel = ''
  const pageTitle = '20D to 2D with t-SNE'
  const title = 'Zotero papers doc2vec data space'
  const subtitle = '20D to 2D with t-SNE'

  // Only used when colorVariableType is not 'continuous'
  const colorPalette = [
    '#EFB605',
    '#E58903',
    '#E01A25',
    '#C20049',
    '#991C71',
    '#66489F',
    '#2074A0',
    '#10A66E',
    '#7EB852'
  ]

  // Set to a [min, max] array to override the data-driven x domain
  const customXDomain = false

  ////////////////////////////////////////////////////////////
  /////////////////////// Load Data //////////////////////////
  ////////////////////////////////////////////////////////////

  const marks = await d3.csv('./tsne-coords-labels-run-1.csv')
  marks.forEach((mark, i) => {
    mark.color = i
    mark.size = 1
    // The id doubles as a CSS class name, so it must not start with a digit
    mark.id = `id${i}`
    // d3.csv yields strings; the position scales need numbers
    mark[xVariable] = Number(mark[xVariable])
    mark[yVariable] = Number(mark[yVariable])
  })

  ////////////////////////////////////////////////////////////
  //////////////////// Setup the Page ////////////////////////
  ////////////////////////////////////////////////////////////

  // These are plain strings, so set them as text rather than as markup
  d3.select('title').text(pageTitle)
  d3.select('#title').text(title)
  d3.select('#subtitle').text(subtitle)

  // Quick fix for resizing some things for mobile-ish viewers
  const mobileScreen = $(window).innerWidth() < 500

  // Scatterplot
  const margin = { left: 60, top: 20, right: 80, bottom: 50 }

  const width = Math.max($('#chart').width(), 960) - margin.left - margin.right
  const height = (width * 2) / 3

  const svg = d3
    .select('#chart')
    .append('svg')
    .attr('width', width + margin.left + margin.right)
    .attr('height', height + margin.top + margin.bottom)

  const wrapper = svg
    .append('g')
    .attr('transform', `translate(${margin.left},${margin.top})`)

  //////////////////////////////////////////////////////
  ///////////// Initialize Axes & Scales ///////////////
  //////////////////////////////////////////////////////

  const opacityCircles = 0.7

  // Search radius (in pixels) for the nearest-circle lookup on hover
  const maxDistanceFromPoint = 50

  // Set the color
  let colorDomain
  let color
  if (colorVariableType === 'continuous') {
    // Numeric comparator: the default sort would order numbers as strings
    colorDomain = Array.from(
      new Set(marks.map(d => Number(d[colorVariable])))
    ).sort(d3.ascending)
    color = d3
      .scaleSequential(d3.interpolateViridis)
      .domain(d3.extent(colorDomain))
  } else {
    // ordinal
    colorDomain = Array.from(new Set(marks.map(d => d[colorVariable]))).sort()
    color = d3
      .scaleOrdinal()
      .range(colorPalette)
      .domain(colorDomain)
  }

  // Set the new x axis range
  const xScale = d3.scaleLinear().range([0, width])

  if (customXDomain) {
    // An exact, hand-picked domain is used as-is (no "nice" rounding)
    xScale.domain(customXDomain)
  } else {
    xScale.domain(d3.extent(marks, d => d[xVariable])).nice()
  }

  // Set new x-axis
  const xAxis = d3
    .axisBottom()
    .ticks(6)
    .scale(xScale)

  // Append the x-axis
  wrapper
    .append('g')
    .attr('class', 'x axis')
    .attr('transform', `translate(0,${height})`)
    .call(xAxis)

  // Set the new y axis range
  const yScale = d3
    .scaleLinear()
    .range([height, 0])
    .domain(d3.extent(marks, d => d[yVariable]))
    .nice()

  const yAxis = d3
    .axisLeft()
    .ticks(6) // Set rough # of ticks
    .scale(yScale)

  // Append the y-axis
  wrapper
    .append('g')
    .attr('class', 'y axis')
    .attr('transform', 'translate(0,0)')
    .call(yAxis)

  // Scale for the bubble size
  const rScale = d3
    .scaleSqrt()
    .range([1, mobileScreen ? 10 : 4])
    .domain(d3.extent(marks, d => d[sizeVariable]))

  //////////////////////////////////////////////////////
  ///////////////// Initialize Labels //////////////////
  //////////////////////////////////////////////////////

  const axisTitleFontSize = `${mobileScreen ? 8 : 12}px`

  // Set up X axis label
  wrapper
    .append('g')
    .append('text')
    .attr('class', 'x title')
    .attr('text-anchor', 'end')
    .style('font-size', axisTitleFontSize)
    .attr('transform', `translate(${width},${height - 10})`)
    .text(xAxisLabel)

  // Set up y axis label
  wrapper
    .append('g')
    .append('text')
    .attr('class', 'y title')
    .attr('text-anchor', 'end')
    .style('font-size', axisTitleFontSize)
    .attr('transform', 'translate(18, 0) rotate(-90)')
    .text(yAxisLabel)

  ////////////////////////////////////////////////////////////
  ///// Capture mouse events and voronoi.find() the site /////
  ////////////////////////////////////////////////////////////

  // Hover state lives in closure variables instead of ad-hoc properties on the selection
  let diagram = null // built on the first mousemove
  let tooltipped = null // Voronoi site whose tooltip is currently shown

  // `function` (not an arrow) because d3.mouse needs the svg node as `this`
  svg.on('mousemove', function() {
    if (!diagram) {
      console.log('computing the voronoi…')
      // Use the same x/y accessors as the cx and cy of the circles
      diagram = d3
        .voronoi()
        .x(d => xScale(d[xVariable]))
        .y(d => yScale(d[yVariable]))(marks)
      console.log('…done.')
    }

    // Translate the pointer position into the coordinate system of `wrapper`
    const [mouseX, mouseY] = d3.mouse(this)
    const px = mouseX - margin.left
    const py = mouseY - margin.top

    // don't react if the mouse is close to one of the axes
    const site =
      px < 5 || py < 5 ? null : diagram.find(px, py, maxDistanceFromPoint)

    if (site !== tooltipped) {
      if (tooltipped) removeTooltip(tooltipped.data)
      if (site) showTooltip(site.data)
      tooltipped = site
    }
  })

  ////////////////////////////////////////////////////////////
  /////////////////// Scatterplot Circles ////////////////////
  ////////////////////////////////////////////////////////////

  // Initiate a group element for the circles
  const circleGroup = wrapper.append('g').attr('class', 'circleWrapper')

  // Sort a copy, biggest first, so the biggest circles are drawn below the
  // smaller ones. The comparator must return a number, not a boolean.
  const marksBySize = [...marks].sort(
    (a, b) => b[sizeVariable] - a[sizeVariable]
  )

  // Place the circle marks
  circleGroup
    .selectAll('.marks')
    .data(marksBySize)
    .enter()
    .append('circle')
    .attr('class', d => `marks ${d[idVariable]}`)
    .attr('cx', d => xScale(d[xVariable]))
    .attr('cy', d => yScale(d[yVariable]))
    .attr('r', d => rScale(d[sizeVariable]))
    .style('opacity', opacityCircles)
    .style('fill', d => color(d[colorVariable]))

  ///////////////////////////////////////////////////////////////////////////
  /////////////////// Hover functions of the circles ////////////////////////
  ///////////////////////////////////////////////////////////////////////////

  /**
   * Hides the tooltip and guides of a previously hovered mark.
   * @param {Object} d - data row of the mark that loses the hover state
   */
  function removeTooltip(d) {
    // Restore the default opacity of the circle
    d3.selectAll(`.marks.${d[idVariable]}`).style('opacity', opacityCircles)

    // Hide tooltip
    $('.popover').remove()

    // Fade out guide lines, then remove them
    d3.selectAll('.guide')
      .transition()
      .duration(200)
      .style('opacity', 0)
      .remove()
  } // function removeTooltip

  /**
   * Shows the popover and axis guides for the hovered mark.
   * @param {Object} d - data row of the mark to highlight
   */
  function showTooltip(d) {
    // Select the circle itself (not the voronoi cell)
    const element = d3.select(`.marks.${d[idVariable]}`)
    const node = element.node()

    // Define and show the tooltip
    // NOTE(review): the label is inserted as HTML; escape it if the CSV can
    // ever contain untrusted text.
    $(node).popover({
      placement: 'auto top',
      container: '#chart',
      trigger: 'manual',
      html: true,
      content() {
        return `<span style='font-size: 11px; text-align: center;'>${d[labelVariable]}</span>`
      }
    })
    $(node).popover('show')

    // Make chosen circle more visible
    element.style('opacity', 1)

    const x = +element.attr('cx')
    const y = +element.attr('cy')
    // Named guideColor so it does not shadow the `color` scale above
    const guideColor = element.style('fill')

    // Append lines to bubbles that will be used to show the precise data points

    // vertical line
    wrapper
      .append('line')
      .attr('class', 'guide')
      .attr('x1', x)
      .attr('x2', x)
      .attr('y1', y)
      .attr('y2', height + 20)
      .style('stroke', guideColor)
      .style('opacity', 0)
      .transition()
      .duration(200)
      .style('opacity', 0.5)
    // Value on the x axis
    wrapper
      .append('text')
      .attr('class', 'guide')
      .attr('x', x)
      .attr('y', height + 38)
      .style('fill', guideColor)
      .style('opacity', 0)
      .style('text-anchor', 'middle')
      .text(d3.format('.2s')(d[xVariable]))
      .transition()
      .duration(200)
      .style('opacity', 0.5)

    // horizontal line
    wrapper
      .append('line')
      .attr('class', 'guide')
      .attr('x1', x)
      .attr('x2', -20)
      .attr('y1', y)
      .attr('y2', y)
      .style('stroke', guideColor)
      .style('opacity', 0)
      .transition()
      .duration(200)
      .style('opacity', 0.5)
    // Value on the y axis
    wrapper
      .append('text')
      .attr('class', 'guide')
      .attr('x', -25)
      .attr('y', y)
      .attr('dy', '0.35em')
      .style('fill', guideColor)
      .style('opacity', 0)
      .style('text-anchor', 'end')
      .text(d3.format('.1f')(d[yVariable]))
      .transition()
      .duration(200)
      .style('opacity', 0.5)
  } // function showTooltip
}

// Start rendering; report failures (e.g. the CSV cannot be loaded) instead of
// leaving the returned promise floating as an unhandled rejection.
draw().catch(error => {
  console.error('Failed to draw the scatterplot:', error)
})

index.html

<!DOCTYPE html>
<html lang="en">
	<head>
		<meta charset="utf-8" />
		<!-- The title text is filled in by index.js -->
		<title></title>

		<!-- Explicit https:// URLs: protocol-relative ones resolve to file:// when the page is opened from disk -->

		<!-- D3.js v5 -->
		<script src="https://d3js.org/d3.v5.js"></script>

		<!-- jQuery 1.11.1 (required by the Bootstrap popover plugin) -->
		<script src="https://ajax.googleapis.com/ajax/libs/jquery/1.11.1/jquery.min.js"></script>
		<!-- Bootstrap 3.3.2 CSS -->
		<link
			rel="stylesheet"
			href="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.2/css/bootstrap.min.css"
		/>
		<!-- Bootstrap 3.3.2 JavaScript -->
		<script src="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.2/js/bootstrap.min.js"></script>

		<!-- Open Sans & CSS -->
		<link
			href="https://fonts.googleapis.com/css?family=Open+Sans:700,400,300"
			rel="stylesheet"
			type="text/css"
		/>
		<!-- Inline placeholder icon so the browser does not request /favicon.ico -->
		<link rel="icon" href="data:;base64,iVBORw0KGgo=" />
		<link href="styles.css" rel="stylesheet" />
	</head>
	<body>
		<div id="cont" class="container-fluid text-center">
			<div class="row scatter">
				<!-- #title, #subtitle and #chart are populated by index.js -->
				<h5 style="color: #3B3B3B;" id="title"></h5>
				<h6 style="color: #A6A6A6;" id="subtitle"></h6>
				<div id="chart"></div>
			</div>
		</div>
		<script src="index.js"></script>
	</body>
</html>

nb.txt

nb.py

#!/usr/bin/env python
# coding: utf-8

# # embed-text-doc2vec

# based on https://medium.com/@mishra.thedeepak/doc2vec-simple-implementation-example-df2afbbfbad5

# first, let's install some dependencies. a guide to doing this: https://jakevdp.github.io/blog/2017/12/05/installing-python-packages-from-jupyter/

# In[1]:


# Install a conda package in the current Jupyter kernel
# `--prefix {sys.prefix}` targets the environment of the running interpreter.
# NOTE(review): `get_ipython()` is provided by IPython/Jupyter; running this
# exported script with plain `python` presumably fails here — confirm.
import sys
get_ipython().system('conda install --yes --prefix {sys.prefix} gensim nltk')


# In[2]:


#Import all the dependencies
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize


# Let’s prepare data for training our doc2vec model

# In[3]:


data_dir = '../../data/'

# our list of documents
data = []


# In[4]:


import glob
import os

# os.path.join avoids the doubled slash of f"{data_dir}/*.txt"; sorted() keeps
# the document order (and therefore the document indices) stable across runs
txt_files = sorted(glob.glob(os.path.join(data_dir, "*.txt")))
print(len(txt_files))


# In[5]:


# show an example of just the filename without the path
os.path.basename(txt_files[0]) if txt_files else None


# In[6]:


# data[i] holds the full text of txt_files[i]
for txt_path in txt_files:
    # the context manager closes the file; no explicit close() is needed
    with open(txt_path, 'r', encoding="utf-8") as txt_file:
        data.append(txt_file.read())

print(len(data))


# In[7]:


from random import randrange
# randrange(n) already excludes n, so pass len(data): with len(data)-1 the last
# document could never be chosen and a one-document corpus raised ValueError
random_index = randrange(len(data))

# print the first 1000 characters of a random document from our corpus
print(data[random_index][0:1000])


# In[7]:


# Wrap every document as a TaggedDocument: lower-cased word tokens, tagged with
# the document's position in `data` (as a string).
tagged_data = []
for doc_index, doc_text in enumerate(data):
    doc_tokens = word_tokenize(doc_text.lower())
    tagged_data.append(TaggedDocument(words=doc_tokens, tags=[str(doc_index)]))


# Here we have our corpus of text documents as training data. Each document is now tagged with its index, so the data is ready for training. Let's start training our model.

# In[8]:


max_epochs = 100
vec_size = 20
alpha = 0.025
# `vector_size` / `epochs` replace the deprecated `size` / `iter` names.
# The learning rate decays linearly from `alpha` to `min_alpha` over the epochs.
model = Doc2Vec(vector_size=vec_size,
                alpha=alpha,
                min_alpha=0.00025,
                min_count=1,
                dm=1,
                epochs=max_epochs)


# In[9]:


model.build_vocab(tagged_data)


# In[10]:


# Train with a single call and let gensim manage the learning-rate decay.
# The earlier loop subtracted 0.002 from alpha on each of its 100 iterations,
# which pushed the learning rate below zero after about 13 of them.
model.train(tagged_data,
            total_examples=model.corpus_count,
            epochs=model.epochs)

model.save("d2v.model")
print("Model d2v.model Saved")


# Note: dm selects the training algorithm. dm=1 means ‘distributed memory’ (PV-DM) and dm=0 means ‘distributed bag of words’ (PV-DBOW). The distributed memory model preserves the word order in a document, whereas distributed bag of words just uses the bag-of-words approach, which doesn’t preserve any word order.
# 
# So we have saved the model and it’s ready to use. Let’s play with it.

# In[8]:


from gensim.models.doc2vec import Doc2Vec

# reload the model that was saved after training
model = Doc2Vec.load("d2v.model")

# infer a vector for a document that is not part of the training data
test_data = word_tokenize("I love chatbots".lower())
v1 = model.infer_vector(test_data)
print("V1_infer", v1)



# In[9]:


# look up the documents most similar to the one tagged '1'
doc_tag = '1'
similar_doc = model.docvecs.most_similar(doc_tag)
print(similar_doc)


# In[10]:


# fetch the trained vector of the document tagged '1',
# i.e. the document at index 1 in the training data
doc_vector = model.docvecs[doc_tag]
print(doc_vector)


# In[11]:


# how many dimensions does our doc2vec document space have?
dimensions = len(doc_vector)
print(dimensions)


# Cool! This dimensionality is determined by the `vec_size` parameter we specified at training time.

# In[12]:


# create column headers for csv file: 'doc' followed by one 'v<i>' per dimension
headers = ['doc'] + [f"v{dim}" for dim in range(dimensions)]

print(headers)


# In[13]:


# retrieve vectors of all documents in training data
# write vectors to a csv file: one row per document, file name first
import csv
import os

with open('document-vectors.csv', 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile, delimiter=',', quotechar='"')
    writer.writerow(headers)

    for doc_index in range(len(data)):
        # basename() strips the directory whatever its length; the former
        # [11:] slice only worked for the exact prefix '../../data/'
        doc_name = os.path.basename(txt_files[doc_index])
        writer.writerow([doc_name] + list(model.docvecs[doc_index]))


# In[14]:


# read vectors in from csv file
import csv

imported_vectors = []

# read with the same encoding the file was written with, so file names that
# contain non-ASCII characters survive on platforms whose default is not UTF-8
with open('document-vectors.csv', newline='', encoding='utf-8') as csvfile:
    reader = csv.reader(csvfile, delimiter=',', quotechar='"')
    for row in reader:
        imported_vectors.append(row)

# header row plus the first document row
print(imported_vectors[0:2])


# In[19]:


# project from 20D to 2D with t-SNE
# NOTE(review): Python only accepts `from __future__` imports at the top of a
# module; this presumably worked because the notebook ran it as its own cell.
# Confirm before running this export as a single .py file.
from __future__ import print_function
import time
import numpy as np
import pandas as pd
# NOTE(review): fetch_openml, PCA and Axes3D are not referenced in the visible
# cells — presumably leftovers from the tutorial this was adapted from.
from sklearn.datasets import fetch_openml
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
# IPython line magic (notebook only)
get_ipython().run_line_magic('matplotlib', 'inline')
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import seaborn as sns


# In[20]:


# bare expression: in a notebook this displays the model's document-vector container
model.docvecs


# In[21]:


# build a DataFrame from the model's document vectors
# NOTE(review): relies on `model.docvecs` being iterable in document order —
# confirm for the installed gensim version
df = pd.DataFrame(list(model.docvecs))

def stripPath(file):
    """Return the file-name part of the path `file`, without its directories.

    Uses os.path.basename instead of slicing off a fixed 11 characters, so the
    result is correct for any directory prefix, not only '../../data/'.
    """
    import os
    return os.path.basename(file)

# list(map(stripPath, txt_files))

# 'y' would normally hold ground-truth categories; here each document simply
# gets its own running index, and 'label' is that index as a string.
doc_count = len(list(model.docvecs))
df['y'] = list(range(doc_count))
df['label'] = df['y'].map(str)

# X, y = None, None

print('Size of the dataframe: {}'.format(df.shape))


# In[22]:


# For reproducibility of the results
np.random.seed(42)

rndperm = np.random.permutation(df.shape[0])


# In[23]:


time_start = time.time()
# Fit on the embedding dimensions only. 'y' (the row index) and 'label' (the
# same index as a string) are bookkeeping columns; feeding them to t-SNE would
# make the document order part of the distances being embedded.
embedding_columns = [column for column in df.columns if column not in ('y', 'label')]
# random_state pins the projection independently of the global NumPy seed
tsne = TSNE(n_components=2, verbose=1, perplexity=40, n_iter=300, random_state=42)
tsne_results = tsne.fit_transform(df[embedding_columns].values)
print('t-SNE done! Time elapsed: {} seconds'.format(time.time()-time_start))


# In[39]:


# first ten rows of the t-SNE output
print(tsne_results[:10])


# In[50]:


# visualize t-SNE projection
# work on a copy so df itself keeps only its original columns
results = df.copy()

# column 0 / column 1 of the t-SNE output become the two plot coordinates
results['tsne-2d-one'] = tsne_results[:,0]
results['tsne-2d-two'] = tsne_results[:,1]

plt.figure(figsize=(16,10))
# NOTE(review): `hue` is commented out, so `palette` and `legend` presumably
# have no effect here; 141 looks like the document count — confirm.
sns.scatterplot(
    x="tsne-2d-one", y="tsne-2d-two",
#     hue="y",
    palette=sns.color_palette("hls", 141),
    data=results,
    legend="full",
    alpha=0.3
)


# In[41]:


# first five rows of the combined frame
print(results[:5])


# In[42]:


# second t-SNE coordinate of every document
print(results['tsne-2d-two'])


# In[46]:


# the running index that was assigned to each document
print(df['y'])


# In[25]:


# project from 20D to 2D with UMAP


# In[33]:


# visualize UMAP projection

styles.css

/* Page-wide typography */
body {
  color: #525252;
  font-family: 'Open Sans', sans-serif;
  font-size: 12px;
  font-weight: 400;
  text-align: center;
}

/* Axis lines and tick marks */
.axis line,
.axis path {
  fill: none;
  shape-rendering: crispEdges;
  stroke: #b3b3b3;
}

/* Axis tick labels */
.axis text {
  fill: #6b6b6b;
  font-size: 10px;
}

/* Circles, hover guides and popovers must not capture pointer events */
.marks,
.guide,
.popover {
  pointer-events: none;
}

/* Hover guide lines and their axis value labels */
.guide {
  font-size: 14px;
  font-weight: 600;
}