Data in Thomas Jefferson's Garden Notebook¶

Happy Birthday, Thomas Jefferson! While running a little startup called the United States of America, TJ supported some pretty awesome science, built some pretty sweet inventions and generally was a pretty cool guy. But apparently, his real love was gardening.

"No occupation is so delightful to me as the culture of the earth, and no culture comparable to that of the garden...But though an old man, I am but a young gardener."¶

* Jefferson to Charles W. Peale, August 20, 1811. Lipscomb, Andrew A. and Albert Ellery Bergh, ed. The Writings of Thomas Jefferson, Volume 13. Washington D.C.: Issued under the auspices of the Thomas Jefferson Memorial Association of the United States, 1903-04, p. 79.

Let's see what kind of horticultural data he left behind!

You might need to install some stuff...¶

# !conda install lxml cssselect pandas bokeh -y
# !pip install pyquery coffeetools jademagic
# !npm install -g coffee-script

# !conda install lxml cssselect pandas bokeh -y
# !pip install pyquery coffeetools jademagic
# !npm install -g coffee-script

import datetime
import re
import math
import pandas
from bokeh.plotting import output_notebook, show, figure, ColumnDataSource
from bokeh.models import HoverTool, CustomJS
from bokeh.resources import CDN
import IPython
from pyquery import PyQuery
from coffeetools import coffee
output_notebook(resources=CDN)

import datetime
import re
import math
import pandas
from bokeh.plotting import output_notebook, show, figure, ColumnDataSource
from bokeh.models import HoverTool, CustomJS
from bokeh.resources import CDN
import IPython
from pyquery import PyQuery
from coffeetools import coffee
output_notebook(resources=CDN)

While you're waiting, let's read about the data set.¶

It is provided by the Massachusetts Historical Society.

IPython.display.IFrame("//www.masshist.org/thomasjeffersonpapers/garden", width="100%", height="800px")

IPython.display.IFrame("//www.masshist.org/thomasjeffersonpapers/garden", width="100%", height="800px")

Here are the URLs we need¶

garden_url = "//www.masshist.org/thomasjeffersonpapers/doc?id=garden_{page}"
img_url = "//www.masshist.org/thomasjeffersonpapers/garden/image/lg/garden_{page}_lg.jpg"

garden_url = "//www.masshist.org/thomasjeffersonpapers/doc?id=garden_{page}"
img_url = "//www.masshist.org/thomasjeffersonpapers/garden/image/lg/garden_{page}_lg.jpg"

We'll refactor this later...

page = 1

page = 1

pq = PyQuery(garden_url.format(page=page))

pq = PyQuery(garden_url.format(page=page))

Looks like a pattern¶

head = pq(".head").text()
head

'1766. Shadwell.'

head = pq(".head").text()
head

'1766. Shadwell.'

year = re.match(r'\d*', head).group()
year

'1766'

year = re.match(r'\d*', head).group()
year

'1766'

location = re.match(r'\d+\.\s*(.*)\.$', head).group(1)
location

'Shadwell'

location = re.match(r'\d+\.\s*(.*)\.$', head).group(1)
location

'Shadwell'

Let's see what we've got¶

entries = [(a.text_content().strip(), b.text_content().strip())
           for a, b in list(zip(pq(".entrydate"), pq(".entry")))]
entries

[('Mar. 30.', 'Purple hyacinth begins to bloom.'),
 ('Apr. 6.', 'Narcissus and Puckoon open.'),
 ('13.', 'Puckoon flowers fallen.'),
 ('16.', 'a bluish colored, funnel-formed flower in lowgrounds in bloom.'),
 ('30.', 'purple flag blooms. Hyacinth & Narcissus gone.'),
 ('May. 4.',
  'Wild honeysuckle in our woods open. -- also the Dwarf flag & Violets'),
 ('7.', 'blue flower in lowgrounds vanished.'),
 ('11.',
  'The purple flag, Dwarf flag, Violet & wild Honeysuckle still in bloom.\nwent journey to  Maryland,  Pennsylva.,  New York. so observations cease')]

entries = [(a.text_content().strip(), b.text_content().strip())
           for a, b in list(zip(pq(".entrydate"), pq(".entry")))]
entries

[('Mar. 30.', 'Purple hyacinth begins to bloom.'),
 ('Apr. 6.', 'Narcissus and Puckoon open.'),
 ('13.', 'Puckoon flowers fallen.'),
 ('16.', 'a bluish colored, funnel-formed flower in lowgrounds in bloom.'),
 ('30.', 'purple flag blooms. Hyacinth & Narcissus gone.'),
 ('May. 4.',
  'Wild honeysuckle in our woods open. -- also the Dwarf flag & Violets'),
 ('7.', 'blue flower in lowgrounds vanished.'),
 ('11.',
  'The purple flag, Dwarf flag, Violet & wild Honeysuckle still in bloom.\nwent journey to  Maryland,  Pennsylva.,  New York. so observations cease')]

Hm, we're going to have to fill in those missing months¶

month = None

month = None

month_pattern = r"Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec"
month_idx = {abbr: i + 1 for i, abbr in enumerate(month_pattern.split("|"))}
day_pattern = r"\d+"

month_pattern = r"Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec"
month_idx = {abbr: i + 1 for i, abbr in enumerate(month_pattern.split("|"))}
day_pattern = r"\d+"

def fix_dates(entries, page=None, month=None, year=None):
    """Turn ``(date_text, entry_text)`` pairs into fully dated records.

    The date text of an entry may omit the month (for example ``'13.'``
    following ``'Apr. 6.'``), so the most recently seen month abbreviation
    and day number are carried forward to entries that lack them.

    Args:
        entries: iterable of ``(date_text, entry_text)`` string pairs.
        page: passed through unchanged as the ``page`` field of each record.
        month: optional month abbreviation (a key of ``month_idx``, such as
            ``"Apr"``) to start from before any month appears in the text.
        year: the year, as an int or a digit string.

    Yields:
        ``(date, year, month_number, page, entry_text)`` tuples, where
        ``date`` is a ``datetime.date``. While no month is known the month
        number falls back to 1 (January).

    Entries that do not form a valid calendar date (no day seen yet, or a
    day out of range for the month) are skipped. If ``year`` cannot be
    converted to an int, nothing is yielded.
    """
    try:
        year_number = int(year)
    except (TypeError, ValueError):
        # Without a usable year no entry can be dated.
        return

    day = None
    for date_text, entry in entries:
        day_match = re.search(day_pattern, date_text)
        month_match = re.search(month_pattern, date_text)

        # Keep the previous day/month when this entry does not state one.
        if day_match:
            day = day_match.group()
        if month_match:
            month = month_match.group()

        month_number = month_idx.get(month, 1)
        try:
            date = datetime.date(year_number, month_number, int(day))
        except (TypeError, ValueError, OverflowError):
            # Best effort: drop entries that cannot be dated.
            continue

        # The yield sits outside the try block so that closing the generator
        # (GeneratorExit) is never swallowed by the error handling above.
        yield date, year_number, month_number, page, entry

def fix_dates(entries, page=None, month=None, year=None):
    """Turn ``(date_text, entry_text)`` pairs into fully dated records.

    The date text of an entry may omit the month (for example ``'13.'``
    following ``'Apr. 6.'``), so the most recently seen month abbreviation
    and day number are carried forward to entries that lack them.

    Args:
        entries: iterable of ``(date_text, entry_text)`` string pairs.
        page: passed through unchanged as the ``page`` field of each record.
        month: optional month abbreviation (a key of ``month_idx``, such as
            ``"Apr"``) to start from before any month appears in the text.
        year: the year, as an int or a digit string.

    Yields:
        ``(date, year, month_number, page, entry_text)`` tuples, where
        ``date`` is a ``datetime.date``. While no month is known the month
        number falls back to 1 (January).

    Entries that do not form a valid calendar date (no day seen yet, or a
    day out of range for the month) are skipped. If ``year`` cannot be
    converted to an int, nothing is yielded.
    """
    try:
        year_number = int(year)
    except (TypeError, ValueError):
        # Without a usable year no entry can be dated.
        return

    day = None
    for date_text, entry in entries:
        day_match = re.search(day_pattern, date_text)
        month_match = re.search(month_pattern, date_text)

        # Keep the previous day/month when this entry does not state one.
        if day_match:
            day = day_match.group()
        if month_match:
            month = month_match.group()

        month_number = month_idx.get(month, 1)
        try:
            date = datetime.date(year_number, month_number, int(day))
        except (TypeError, ValueError, OverflowError):
            # Best effort: drop entries that cannot be dated.
            continue

        # The yield sits outside the try block so that closing the generator
        # (GeneratorExit) is never swallowed by the error handling above.
        yield date, year_number, month_number, page, entry

Ok, I think we're ready to be in pandas¶

df = pandas.DataFrame.from_records(
    fix_dates(entries, year=year, page=page),
    columns=["date", "year", "month", "page", "entry"],
    index=["date"])
df

df = pandas.DataFrame.from_records(
    fix_dates(entries, year=year, page=page),
    columns=["date", "year", "month", "page", "entry"],
    index=["date"])
df

Yep. Okay, let's make it pretty and interactive...¶

def hover():
    """Build a HoverTool whose tooltip shows a "Date" row and an "Entry" row.

    The "Date" row displays ``$y`` and the "Entry" row displays the
    ``entry`` column of the hovered data source.
    """
    tooltip_rows = [
        ("Date", "$y"),
        ("Entry", "@entry"),
    ]
    return HoverTool(tooltips=tooltip_rows)

def hover():
    """Build a HoverTool whose tooltip shows a "Date" row and an "Entry" row.

    The "Date" row displays ``$y`` and the "Entry" row displays the
    ``entry`` column of the hovered data source.
    """
    tooltip_rows = [
        ("Date", "$y"),
        ("Entry", "@entry"),
    ]
    return HoverTool(tooltips=tooltip_rows)

cds = ColumnDataSource(df)
p = figure(tools=[hover()], x_axis_type="datetime")
p.circle(x="date", y="year", source=cds)
show(p);

cds = ColumnDataSource(df)
p = figure(tools=[hover()], x_axis_type="datetime")
p.circle(x="date", y="year", source=cds)
show(p);

Not bad.¶

Great, let's bring it all together¶

# be nice: cache
_pq = {}

# be nice: cache
_pq = {}

# Carried across pages: a page whose heading states no year (or location)
# keeps the most recent value seen on an earlier page.
year = None
location = None

all_entries = []

for page in range(1, 67):
    url = garden_url.format(page=page)
    # be nice: only fetch a page that is not already in the cache
    if url not in _pq:
        _pq[url] = PyQuery(url)
    pq = _pq[url]
    head = pq(".head").text()

    # Headings look like '1766. Shadwell.': the year, then the location.
    location_match = re.match(r'\d+\.\s*(.*)\.$', head)
    if location_match:
        location = location_match.group(1)

    year_match = re.search(r'\d{4}', head)
    if year_match:
        year = year_match.group()

    entries = [(a.text_content().strip(), b.text_content().strip())
               for a, b in zip(pq(".entrydate"), pq(".entry"))]

    # fix_dates is a generator, and a generator object is always truthy, so
    # an emptiness check on it says nothing. Extending with a generator that
    # yields no records is simply a no-op.
    all_entries.extend(fix_dates(entries, year=year, page=page))

# Carried across pages: a page whose heading states no year (or location)
# keeps the most recent value seen on an earlier page.
year = None
location = None

all_entries = []

for page in range(1, 67):
    url = garden_url.format(page=page)
    # be nice: only fetch a page that is not already in the cache
    if url not in _pq:
        _pq[url] = PyQuery(url)
    pq = _pq[url]
    head = pq(".head").text()

    # Headings look like '1766. Shadwell.': the year, then the location.
    location_match = re.match(r'\d+\.\s*(.*)\.$', head)
    if location_match:
        location = location_match.group(1)

    year_match = re.search(r'\d{4}', head)
    if year_match:
        year = year_match.group()

    entries = [(a.text_content().strip(), b.text_content().strip())
               for a, b in zip(pq(".entrydate"), pq(".entry"))]

    # fix_dates is a generator, and a generator object is always truthy, so
    # an emptiness check on it says nothing. Extending with a generator that
    # yields no records is simply a no-op.
    all_entries.extend(fix_dates(entries, year=year, page=page))

df = pandas.DataFrame.from_records(
            all_entries,
            columns=["date", "year", "month", "page", "entry"],
            index=["date"])

df = pandas.DataFrame.from_records(
            all_entries,
            columns=["date", "year", "month", "page", "entry"],
            index=["date"])

Let's add some more columns for our viz¶

# Marker size grows with the log of the entry's text length. math.log(0)
# raises ValueError, so an empty entry is treated as length 1 (size 0).
df["length"] = [math.log(max(len(entry), 1)) * 3 for entry in df["entry"]]

# Marker size grows with the log of the entry's text length. math.log(0)
# raises ValueError, so an empty entry is treated as length 1 (size 0).
df["length"] = [math.log(max(len(entry), 1)) * 3 for entry in df["entry"]]

df["julian"] = [int(idx.strftime("%j")) for idx in df.index]

df["julian"] = [int(idx.strftime("%j")) for idx in df.index]

Oh yes, and about those images...¶

df["month_name"] = df.month.apply(lambda e: datetime.datetime(2001,e,1).strftime("%B"))

df["month_name"] = df.month.apply(lambda e: datetime.datetime(2001,e,1).strftime("%B"))

# Earliest entry date on each page. Grouping by the scalar column name (not
# a one-element list) keeps the group keys plain page numbers, so they can
# be formatted straight into the image URL.
first_dates = {
    page: min(entry_dates)
    for page, entry_dates in df.groupby("page").groups.items()
}

# One row per page, indexed by that earliest date. "julian" is the day of
# the year as an int, matching the df["julian"] column.
pages = pandas.DataFrame.from_records(
    [(first, int(first.strftime("%j")), page, img_url.format(page=page))
     for page, first in first_dates.items()],
    columns=["date", "julian", "page", "url"],
    index="date"
)
pages.head()

# Earliest entry date on each page. Grouping by the scalar column name (not
# a one-element list) keeps the group keys plain page numbers, so they can
# be formatted straight into the image URL.
first_dates = {
    page: min(entry_dates)
    for page, entry_dates in df.groupby("page").groups.items()
}

# One row per page, indexed by that earliest date. "julian" is the day of
# the year as an int, matching the df["julian"] column.
pages = pandas.DataFrame.from_records(
    [(first, int(first.strftime("%j")), page, img_url.format(page=page))
     for page, first in first_dates.items()],
    columns=["date", "julian", "page", "url"],
    index="date"
)
pages.head()

Nice. Please show me the money.¶

p = figure(title="TJ's Garden Notes (1766-1824)", 
           y_axis_type="datetime", y_axis_label="Date",
           x_axis_label="Day of Year")
p.circle(x="julian", y="date", size="length", alpha=0.5,
         source=ColumnDataSource(df))
show(p);

p = figure(title="TJ's Garden Notes (1766-1824)", 
           y_axis_type="datetime", y_axis_label="Date",
           x_axis_label="Day of Year")
p.circle(x="julian", y="date", size="length", alpha=0.5,
         source=ColumnDataSource(df))
show(p);

pages.url = pages.url.apply(lambda v: v.replace('lg','thumb'))

pages.url = pages.url.apply(lambda v: v.replace('lg','thumb'))

# Give the entry table an empty 'url' column.
df['url'] = ""
# Point each page at its thumbnail image. The whole column is assigned at
# once: DataFrame.set_value no longer exists in current pandas, and this
# also avoids writing to `pages` while iterating over it.
pages['url'] = [v.replace('lg', 'thumb') for v in pages['url']]

# Give the entry table an empty 'url' column.
df['url'] = ""
# Point each page at its thumbnail image. The whole column is assigned at
# once: DataFrame.set_value no longer exists in current pandas, and this
# also avoids writing to `pages` while iterating over it.
pages['url'] = [v.replace('lg', 'thumb') for v in pages['url']]

# Final interactive figure: day of year on x, calendar date on y.
p = figure(title="TJ's Garden Book(1766-1824)",
           y_axis_type="datetime", y_axis_label="Date",
           x_axis_label="Day of Year",
           width=600,
           height=800)

# One circle per entry (sized by the "length" column) and one page image per
# row of `pages`, each backed by its own ColumnDataSource.
circle = p.circle(x="julian", y="date", size="length", alpha=0.5,
                  source=ColumnDataSource(df))
images = p.image_url(x="julian", y="date", url="url", anchor='center', global_alpha=0.2,
                     source=ColumnDataSource(pages))

# Hovering a circle shows no built-in tooltip (tooltips=None). The JS
# callback instead fills the #info element with the hovered entries and sets
# the page image as the background of #jefferson-img-lg. It only runs once
# window.moment is available.
p.add_tools(HoverTool(
        renderers=[circle],
        tooltips=None,
        callback=CustomJS(
            args={
                'circle': circle.data_source,
                'images': images.data_source
            }, code=r"""

if(window.moment){
    update();
}

function update() {
    var cdata = circle.get('data'),
        indices = cb_data.index['1d'].indices,
        info = $('#info').text(''),
        page = cdata['page'][indices[0]],
        pageIdx = _.indexOf(images.attributes.data.page, page),
        url = images.attributes.data.url[pageIdx],
        img = url && $('#jefferson-img-lg').css({
            "background-image": "url(" + url.replace(/thumb/g, "lg") + ")",
            "background-size": "contain",
            "background-position": "center center",
            "background-repeat": "no-repeat",
            "position": "absolute",
            top: 0,
            left: 0,
            right: 0,
            bottom: 0
        });

    indices.map(function(index){
        $("<div/>")
            .css({"padding": "1rem", "line-height": "2rem"})
            .append([
                $("<h4/>").text(moment(new Date(cdata['date'][index]))
                    .format("dddd, MMMM D, YYYY")),
                $("<ul/>").append(
                    cdata['entry'][index].split("\n")
                        .filter(function(line){ return line.trim(); })
                        .map(function(line){
                            return $("<li/>").text(line);
                        })
                )
            ])
            .appendTo(info);
    });
}
                """)))

# When the visible x range changes, recompute the page images' opacity from
# the width of the range (dx): .2 plus a Gaussian term in (dx - 1) with a
# spread of 50, applied only while dx > 1.
p.x_range.callback = CustomJS(
    args={
        'x_axis': p.x_range,
        'y_axis': p.y_range,
        'images': images.glyph,
    },
    code=coffee.compile(
        """
        dx = x_axis.get('end') - x_axis.get('start')
        lower = 1
        if dx > lower
            images.set 'global_alpha', .2 + (.7)*Math.exp( -1*Math.pow(dx-lower,2)/(2*Math.pow(50,2)))
        """,
        bare=True))
show(p);

# Final interactive figure: day of year on x, calendar date on y.
p = figure(title="TJ's Garden Book(1766-1824)",
           y_axis_type="datetime", y_axis_label="Date",
           x_axis_label="Day of Year",
           width=600,
           height=800)

# One circle per entry (sized by the "length" column) and one page image per
# row of `pages`, each backed by its own ColumnDataSource.
circle = p.circle(x="julian", y="date", size="length", alpha=0.5,
                  source=ColumnDataSource(df))
images = p.image_url(x="julian", y="date", url="url", anchor='center', global_alpha=0.2,
                     source=ColumnDataSource(pages))

# Hovering a circle shows no built-in tooltip (tooltips=None). The JS
# callback instead fills the #info element with the hovered entries and sets
# the page image as the background of #jefferson-img-lg. It only runs once
# window.moment is available.
p.add_tools(HoverTool(
        renderers=[circle],
        tooltips=None,
        callback=CustomJS(
            args={
                'circle': circle.data_source,
                'images': images.data_source
            }, code=r"""

if(window.moment){
    update();
}

function update() {
    var cdata = circle.get('data'),
        indices = cb_data.index['1d'].indices,
        info = $('#info').text(''),
        page = cdata['page'][indices[0]],
        pageIdx = _.indexOf(images.attributes.data.page, page),
        url = images.attributes.data.url[pageIdx],
        img = url && $('#jefferson-img-lg').css({
            "background-image": "url(" + url.replace(/thumb/g, "lg") + ")",
            "background-size": "contain",
            "background-position": "center center",
            "background-repeat": "no-repeat",
            "position": "absolute",
            top: 0,
            left: 0,
            right: 0,
            bottom: 0
        });

    indices.map(function(index){
        $("<div/>")
            .css({"padding": "1rem", "line-height": "2rem"})
            .append([
                $("<h4/>").text(moment(new Date(cdata['date'][index]))
                    .format("dddd, MMMM D, YYYY")),
                $("<ul/>").append(
                    cdata['entry'][index].split("\n")
                        .filter(function(line){ return line.trim(); })
                        .map(function(line){
                            return $("<li/>").text(line);
                        })
                )
            ])
            .appendTo(info);
    });
}
                """)))

# When the visible x range changes, recompute the page images' opacity from
# the width of the range (dx): .2 plus a Gaussian term in (dx - 1) with a
# spread of 50, applied only while dx > 1.
p.x_range.callback = CustomJS(
    args={
        'x_axis': p.x_range,
        'y_axis': p.y_range,
        'images': images.glyph,
    },
    code=coffee.compile(
        """
        dx = x_axis.get('end') - x_axis.get('start')
        lower = 1
        if dx > lower
            images.set 'global_alpha', .2 + (.7)*Math.exp( -1*Math.pow(dx-lower,2)/(2*Math.pow(50,2)))
        """,
        bare=True))
show(p);

df.to_csv('jefferson.csv')

df.to_csv('jefferson.csv')

%%html
<div id="info"></div>

%%html
<div id="info"></div>

%%html
<div id="jefferson-img-lg"></div>

%%html
<div id="jefferson-img-lg"></div>

%%javascript
nbpresent.mode.themes.set({
"default": "jefferson",
"theme": {
    jefferson: {
        palette: {
            "paper": {
                id: "paper",
                rgb: [252, 248, 237]
            },
            "ink": {
                id: "ink",
                rgb: [65, 55, 41]
            },
            "highlight": {
                id: "highlight",
                rgb: [162, 137, 100]
            }
        },
        backgrounds: {
            "paper": {
                "background-color": "paper"
            }
        },
        "text-base": {
            "font-family": "Vollkorn",
            "font-size": 1.5,
            "color": "ink"
        },
        rules: {
            h1: {
                "font-size": 7,
                "color": "ink"
            },
            h2: {
                "color": "highlight",
                "font-size": 5,
            },
            h4: {
                "font-size": 2,
                "color": "highlight",
                "text-align": "center",
                "border-bottom": "solid 1px rgb(162, 137, 100)"
            },
            blockquote: {
                "text-align": "justify",
                "border": "none",
                "font-style": "italic"
            }
        }
    }
}});

%%javascript
nbpresent.mode.themes.set({
"default": "jefferson",
"theme": {
    jefferson: {
        palette: {
            "paper": {
                id: "paper",
                rgb: [252, 248, 237]
            },
            "ink": {
                id: "ink",
                rgb: [65, 55, 41]
            },
            "highlight": {
                id: "highlight",
                rgb: [162, 137, 100]
            }
        },
        backgrounds: {
            "paper": {
                "background-color": "paper"
            }
        },
        "text-base": {
            "font-family": "Vollkorn",
            "font-size": 1.5,
            "color": "ink"
        },
        rules: {
            h1: {
                "font-size": 7,
                "color": "ink"
            },
            h2: {
                "color": "highlight",
                "font-size": 5,
            },
            h4: {
                "font-size": 2,
                "color": "highlight",
                "text-align": "center",
                "border-bottom": "solid 1px rgb(162, 137, 100)"
            },
            blockquote: {
                "text-align": "justify",
                "border": "none",
                "font-style": "italic"
            }
        }
    }
}});

%%html
<!-- Loads underscore.js with a plain script tag, then uses requirejs to
     fetch moment.js and publish it as window.moment. The hover callback
     earlier in this notebook only runs its update once window.moment is set. -->
<script src="https://cdnjs.cloudflare.com/ajax/libs/underscore.js/1.8.3/underscore.js"></script>
<script>
$(function(){
            
    requirejs({
        paths: {moment: "https://cdnjs.cloudflare.com/ajax/libs/moment.js/2.12.0/moment.min"}
    },
    ["moment"],
    function(moment){
        window.moment = moment;
        // NOTE(review): in this notebook `update` is only declared inside
        // the hover CustomJS code string; confirm it is reachable as a
        // global here, otherwise this call throws a ReferenceError.
        update();
    });
})
</script>

%%html
<!-- Loads underscore.js with a plain script tag, then uses requirejs to
     fetch moment.js and publish it as window.moment. The hover callback
     earlier in this notebook only runs its update once window.moment is set. -->
<script src="https://cdnjs.cloudflare.com/ajax/libs/underscore.js/1.8.3/underscore.js"></script>
<script>
$(function(){
            
    requirejs({
        paths: {moment: "https://cdnjs.cloudflare.com/ajax/libs/moment.js/2.12.0/moment.min"}
    },
    ["moment"],
    function(moment){
        window.moment = moment;
        // NOTE(review): in this notebook `update` is only declared inside
        // the hover CustomJS code string; confirm it is reachable as a
        // global here, otherwise this call throws a ReferenceError.
        update();
    });
})
</script>

	julian	page	url
date
1766-03-30	089	1	//www.masshist.org/thomasjeffersonpapers/...
1767-02-09	040	2	//www.masshist.org/thomasjeffersonpapers/...
1767-05-27	147	3	//www.masshist.org/thomasjeffersonpapers/...
1786-02-24	055	4	//www.masshist.org/thomasjeffersonpapers/...
1769-03-14	073	5	//www.masshist.org/thomasjeffersonpapers/...

	year	month	page	entry
date
1766-03-30	1766	3	1	Purple hyacinth begins to bloom.
1766-04-06	1766	4	1	Narcissus and Puckoon open.
1766-04-13	1766	4	1	Puckoon flowers fallen.
1766-04-16	1766	4	1	a bluish colored, funnel-formed flower in lowg...
1766-04-30	1766	4	1	purple flag blooms. Hyacinth & Narcissus gone.
1766-05-04	1766	5	1	Wild honeysuckle in our woods open. -- also th...
1766-05-07	1766	5	1	blue flower in lowgrounds vanished.
1766-05-11	1766	5	1	The purple flag, Dwarf flag, Violet & wild Hon...

	inspect Hover Tool

	inspect Hover Tool