Like the previous example, but with the option to provide a custom string. The application relies on a server-side computation that uses Oliver Mader’s code to compute the essential matching pairs.
The example shows the first 93 digits of PI.
A translucent white stroke has also been introduced to avoid mistaking palindrome repetition of matching subsequences with large subsequences (e.g. 793238 and 383279 on the left side of the visualization).
// Generated by CoffeeScript 1.10.0
(function() {
var height, redraw, svg, width;
svg = d3.select('svg');
width = svg.node().getBoundingClientRect().width;
height = svg.node().getBoundingClientRect().height;
svg.attr({
viewBox: (-width / 2) + " " + (-height + 60) + " " + width + " " + height
});
width -= 60;
redraw = function() {
var j, ref, results, sequence, x;
sequence = d3.select('#sequence_input').node().value;
x = d3.scale.ordinal().domain((function() {
results = [];
for (var j = 0, ref = sequence.length; 0 <= ref ? j <= ref : j >= ref; 0 <= ref ? j++ : j--){ results.push(j); }
return results;
}).apply(this)).rangeBands([-width / 2, width / 2]);
return d3.json("main.php?sequence=" + sequence, function(matches) {
var arcs, characters, enter_arcs, subsequence;
svg.selectAll('.character').remove();
svg.selectAll('.arc').remove();
svg.select('.subsequence').remove();
characters = svg.selectAll('.character').data(sequence);
characters.enter().append('text').text(function(d) {
return d;
}).attr({
"class": 'character',
x: function(d, i) {
return x(i) + x.rangeBand() / 2;
},
dy: '1em'
});
arcs = svg.selectAll('.arc').data(matches.filter(function(m) {
return m.length > 1;
}));
enter_arcs = arcs.enter().append('path').attr({
"class": 'arc',
d: function(match) {
var big_radius, end_left, end_right, small_radius, start_left, start_right;
start_left = x(match.source);
start_right = x(match.source + match.length);
end_left = x(match.target);
end_right = x(match.target + match.length);
big_radius = (end_right - start_left) / 2;
small_radius = (end_left - start_right) / 2;
return "M" + start_left + " 0 A" + big_radius + " " + big_radius + " 0 1 1 " + end_right + " 0 L" + end_left + " 0 A" + small_radius + " " + small_radius + " 0 1 0 " + start_right + " 0 z";
}
});
subsequence = svg.append('text').attr({
"class": 'subsequence',
y: -350
});
enter_arcs.on('click', function(d1) {
subsequence.text(d1.subsequence);
svg.selectAll('.arc').classed('selected', function(d2) {
return d1.subsequence === d2.subsequence;
}).classed('dimmed', function(d2) {
return d1.subsequence !== d2.subsequence;
});
return d3.event.stopPropagation();
});
return svg.on('click', function() {
subsequence.text('');
return svg.selectAll('.arc').classed('selected', false).classed('dimmed', false);
});
});
};
redraw();
d3.select('#sequence_input').on('change', function() {
console.log('change');
return redraw();
});
}).call(this);
<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8">
<title>Arc Diagram III</title>
<link type="text/css" href="index.css" rel="stylesheet"/>
<script src="//d3js.org/d3.v3.min.js"></script>
</head>
<body>
<svg></svg>
<div id="input_box">
<label for="sequence_input">Sequence to be analyzed (press ENTER to redraw):</label>
<input id="sequence_input" type="text" value="314159265358979323846264338327950288419716939937510582097494459230781640628620899862803482534">
</div>
<script src="index.js"></script>
</body>
</html>
# Copyright (C) 2013 Oliver Mader <b52@reaktor42.de>
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
# -*- coding: utf-8 -*-
import naive_algorithm
from collections import defaultdict
from itertools import product, islice
from suffix_tree import SuffixTree
__all__ = [
'maximal_matching_pairs',
'repetition_regions',
'essential_matching_pairs'
]
def maximal_matching_pairs(tree):
"""
Find all substring pairs fulfilling the properties specified in
definition 1, namely _identical_, _non-overlapping_, _consecutive_ and
_maximal_.
Args:
tree (SuffixTree): A suffix tree, build from the string to be searched.
Returns:
generator. A generator of tuples, each describing one matching pair as
composed of the start of the first and the second substring,
as well as the length of the substring.
"""
s = tree.string
substrings = defaultdict(set)
# get all substring starts repeated at least once
for leaf in tree.leaves:
node = leaf.parent
end = leaf.start
while node is not None and \
(node.pathLabel not in substrings or end - len(node.pathLabel) \
not in substrings[node.pathLabel]) and \
node.edgeLabel != '':
for i in range(len(node.pathLabel)):
substrings[node.pathLabel[:i+1]].add(end - len(node.pathLabel))
end -= len(node.edgeLabel)
node = node.parent
def contained(x, y, l, largers):
for large in largers:
starts = substrings[large]
idx = list(find_all(large, sub))
for i,j in product(idx, idx):
if (x - i) + len(large) <= (y - j) and (x - i) in starts and \
(y - j) in starts:
return True
return False
# apply constraints and yield legit pairs
for sub in sorted(substrings, key=len, reverse=True):
starts = sorted(substrings[sub])
l = len(sub)
cs = [k for k in substrings if len(k) > len(sub) and
k.startswith(sub) and k.endswith(sub)]
for x, y in zip(starts, islice(starts, 1, None)):
# overlapping
if x + l > y:
continue
# non maximal: left and right expansion
if (x > 0 and x + l < y and s[x-1] == s[y-1]) or \
(y + l < len(s) and x + l < y and s[x+l] == s[y+l]):
continue
# not maximal: inner and outer expansion
if contained(x, y, l, cs):
continue
yield x, y, l
def repetition_regions(tree):
"""
Find all repetition regions as specified in definition 2 and the following
further limiting rules:
2.1) _Minimal_: There does not exist any other repetition region R'
containing R, with the fundamental substring P' containing P.
2.2) _Leftmost_: There doesn't exist another repetition region R',
originating from R shifted one character to the left, with equal
length of the region and of the fundamental substring.
Args:
tree (SuffixTree): A suffix tree, build from the string to be searched.
Returns:
list. A list of tuples, each describing one repetition region build
from multiple matching pairs. The tuples contain the start of
the region, the end of the region not inclusive and the length
of the fundamental substring, respectively.
"""
return naive_algorithm.repetition_regions(tree.string)
s = tree.string
substrings = defaultdict(set)
for leaf in tree.leaves:
node = leaf.parent
end = leaf.start
while node is not None and \
(node.pathLabel not in substrings or end - len(node.pathLabel) \
not in substrings[node.pathLabel]) and \
node.edgeLabel != '':
for i in range(len(node.pathLabel)):
substrings[node.pathLabel[:i+1]].add(end - len(node.pathLabel))
end -= len(node.edgeLabel)
node = node.parent
regions = []
for sub in sorted(substrings, key=len):
starts = sorted(substrings[sub])
l = len(sub)
a = 0
while a < len(starts) - 1:
f = starts[a]
e = f + l
b = a + 1
while b < len(starts) and starts[b] - starts[b - 1] == l:
e += l
b += 1
if b - a > 1:
regions.append((f, e, l))
a = b
return regions
def essential_matching_pairs(string):
"""
Find all essential matching pairs as specified in definition 3. These
pairs might be used to build arc diagrams from.
Args:
string (str): The string to be searched.
Returns:
generator. Yields tuples, each describing one matching pair as composed
of the start of the first and the second substring, as well
as the length of the substring.
"""
tree = SuffixTree(string)
regions = repetition_regions(tree)
# definition 3.1 and 3.2
for x,y,l in maximal_matching_pairs(tree):
if not any(x >= r and y + l <= e for r,e,_ in regions) or \
any(int((x - r)/f) == int((y + l - r - 1)/f) for r,_,f in regions):
yield x, y, l
# definition 3.3
for r,e,l in regions:
for x in range(r, e - l, l):
yield x, x + l, l
def children(node):
""" Returns a generator to iterate all direct node children. """
c = node.firstChild
while c is not None:
yield c
c = c.next
def find_all(a_str, sub):
""" Yield all occurences of a substring. """
start = 0
while True:
start = a_str.find(sub, start)
if start == -1: return
yield start
start += 1
# vim: set expandtab shiftwidth=4 softtabstop=4 textwidth=79:
�
!SVc @ s� d d l Z d d l m Z d d l m Z m Z d d l m Z d d d g Z d � Z d � Z
d
� Z d � Z d � Z
d S(
i����N( t defaultdict( t productt islice( t
SuffixTreet maximal_matching_pairst repetition_regionst essential_matching_pairsc
# s� | j } t t � � x� | j D]� } | j } | j } x� | d k r� | j � k su | t | j � � | j k r� | j d k r� xF t
t | j � � D]/ } � | j | d j | t | j � � q� W| t | j � 8} | j } q: Wq W� � f d � } xvt � d t d t
�D]\� t � � � } t � � } g � D]B } t | � t � � k rE| j � � rE| j � � rE| ^ qE}
x� t | t | d d � � D]� \ } } | | | k r�q�n | d k r| | | k r| | d | | d k s�| | t | � k rK| | | k rK| | | | | | k rKq�n | | | | |
� rfq�n | | | f Vq�WqWd S( s�
Find all substring pairs fulfilling the properties specified in
definition 1, namely _identical_, _non-overlapping_, _consecutive_ and
_maximal_.
Args:
tree (SuffixTree): A suffix tree, build from the string to be searched.
Returns:
generator. A generator of tuples, each describing one matching pair as
composed of the start of the first and the second substring,
as well as the length of the substring.
t i c s� x� | D]� } � | } t t | � � � } x_ t | | � D]N \ } } | | t | � | | k r<