|
|
@@ -1,320 +0,0 @@ |
|
|
|
""" |
|
|
|
@author: linlin |
|
|
|
@references: |
|
|
|
[1] `ptrus/suffix-trees <https://github.com/ptrus/suffix-trees/blob/master/suffix_trees/STree.py>`__, 2018.6 |
|
|
|
""" |
|
|
|
|
|
|
|
import sys |
|
|
|
|
|
|
|
|
|
|
|
class STree():
    """Class representing the suffix tree. The generalized suffix tree is supported.

    A tree built from a single string indexes that string followed by one
    terminal symbol.  A tree built from a list of strings (a generalized
    suffix tree, GST) indexes the concatenation of all strings, each one
    followed by its own terminal symbol; positions returned by ``find`` and
    ``find_all`` are offsets into that concatenated word (``self.word``).
    """

    def __init__(self, input=''):
        """Create the root node and, unless ``input`` is ``''``, build the tree.

        :param input: String or List of Strings
        """
        self.root = _SNode()
        self.root.depth = 0
        self.root.idx = 0
        # The root is its own parent and its own suffix link.
        self.root.parent = self.root
        self.root._add_suffix_link(self.root)

        # Defaults so that find()/find_all()/lcs() are usable on a tree that
        # has not been built (or was built from a single string) instead of
        # failing with AttributeError.
        self.word = ''
        self.word_starts = []

        if not input == '':
            self.build(input)

    def _check_input(self, input):
        """Checks the validity of the input.

        :param input: String or List of Strings
        :return: ``'st'`` for a string, ``'gst'`` for a list of strings.
        :raises ValueError: for any other input.
        """
        if isinstance(input, str):
            return 'st'
        elif isinstance(input, list):
            if all(isinstance(item, str) for item in input):
                return 'gst'

        raise ValueError("String argument should be of type String or"
                         " a list of strings")

    def build(self, x):
        """Builds the Suffix tree on the given input.

        If the input is of type List of Strings:
        Generalized Suffix Tree is built.

        :param x: String or List of Strings
        :raises ValueError: if ``x`` is neither a string nor a list of strings.
        """
        kind = self._check_input(x)

        if kind == 'st':
            # A single string only ever needs the first terminal symbol.
            x += next(self._terminalSymbolsGenerator())
            self._build(x)
        elif kind == 'gst':
            self._build_generalized(x)

    def _build(self, x):
        """Builds a Suffix tree over ``x`` (terminal symbols already appended)."""
        self.word = x
        self._build_McCreight(x)

    def _build_McCreight(self, x):
        """Builds a Suffix tree using McCreight O(n) algorithm.

        Algorithm based on:
        McCreight, Edward M. "A space-economical suffix tree construction algorithm." - ACM, 1976.
        Implementation based on:
        UH CS - 58093 String Processing Algorithms Lecture Notes
        """
        u = self.root
        d = 0
        for i in range(len(x)):
            # Walk down from u for as long as suffix i keeps matching.
            while u.depth == d and u._has_transition(x[d + i]):
                u = u._get_transition_link(x[d + i])
                d = d + 1
                while d < u.depth and x[u.idx + d] == x[i + d]:
                    d = d + 1
            # The match stopped in the middle of an edge: split it.
            if d < u.depth:
                u = self._create_node(x, u, d)
            self._create_leaf(x, i, u, d)
            if not u._get_suffix_link():
                self._compute_slink(x, u)
            # Continue with the next suffix from the suffix-link target.
            u = u._get_suffix_link()
            d = d - 1
            if d < 0:
                d = 0

    def _create_node(self, x, u, d):
        """Splits the edge leading to ``u`` at depth ``d`` and returns the new internal node."""
        i = u.idx
        p = u.parent
        v = _SNode(idx=i, depth=d)
        v._add_transition_link(u, x[i + d])
        u.parent = v
        # Same first character as the old p -> u link, so this replaces it.
        p._add_transition_link(v, x[i + p.depth])
        v.parent = p
        return v

    def _create_leaf(self, x, i, u, d):
        """Creates the leaf for the suffix starting at ``i`` below node ``u`` and returns it."""
        w = _SNode()
        w.idx = i
        w.depth = len(x) - i
        u._add_transition_link(w, x[i + d])
        w.parent = u
        return w

    def _compute_slink(self, x, u):
        """Computes and stores the suffix link of ``u``, creating its target node if needed."""
        d = u.depth
        v = u.parent._get_suffix_link()
        while v.depth < d - 1:
            v = v._get_transition_link(x[u.idx + v.depth + 1])
        if v.depth > d - 1:
            v = self._create_node(x, v, d - 1)
        u._add_suffix_link(v)

    def _build_Ukkonen(self, x):
        """Builds a Suffix tree using Ukkonen's online O(n) algorithm.

        Algorithm based on:
        Ukkonen, Esko. "On-line construction of suffix trees." - Algorithmica, 1995.

        :raises NotImplementedError: always; not implemented yet.
        """
        # TODO.
        raise NotImplementedError()

    def _build_generalized(self, xs):
        """Builds a Generalized Suffix Tree (GST) from the array of strings provided.

        :param xs: List of Strings
        """
        terminal_gen = self._terminalSymbolsGenerator()

        # Every string gets its own terminal symbol.
        _xs = ''.join([x + next(terminal_gen) for x in xs])
        self.word = _xs
        self._generalized_word_starts(xs)
        self._build(_xs)
        self.root._traverse(self._label_generalized)

    def _label_generalized(self, node):
        """Helper method that labels the nodes of GST with indexes of strings
        found in their descendants.

        Reads the labels of the node's children, so children must be labeled
        before their parent.
        """
        if node.is_leaf():
            x = {self._get_word_start_index(node.idx)}
        else:
            x = {
                n
                for ns in node.transition_links for n in ns[0].generalized_idxs
            }
        node.generalized_idxs = x

    def _get_word_start_index(self, idx):
        """Helper method that returns the index of the string based on node's
        starting index.

        :param idx: Position in the concatenated word.
        :return: Index of the last entry of ``self.word_starts`` that is not
                 greater than ``idx`` (0 when there is none).
        """
        from bisect import bisect_right

        # word_starts is built in increasing order, so a binary search finds
        # the owning string; the clamp covers an empty word_starts list.
        return max(bisect_right(self.word_starts, idx) - 1, 0)

    def lcs(self, stringIdxs=-1):
        """Returns the Largest Common Substring of Strings provided in stringIdxs.

        If stringIdxs is not provided (or is not a list), the LCS of all
        strings is returned.

        :param stringIdxs: Optional: List of indexes of strings.
        :return: The common substring; ``''`` when there is none or when no
                 generalized suffix tree has been built.
        """
        if not self.word_starts:
            # No generalized tree was built, so there are no labeled nodes
            # to search.
            return ''

        if stringIdxs == -1 or not isinstance(stringIdxs, list):
            stringIdxs = set(range(len(self.word_starts)))
        else:
            stringIdxs = set(stringIdxs)

        deepestNode = self._find_lcs(self.root, stringIdxs)
        start = deepestNode.idx
        end = deepestNode.idx + deepestNode.depth
        return self.word[start:end]

    def _find_lcs(self, node, stringIdxs):
        """Helper method that finds LCS by traversing the labeled GST.

        Only descends into nodes whose ``generalized_idxs`` contain every
        index in ``stringIdxs``.  Implemented with an explicit stack so that
        long common substrings cannot exhaust the recursion limit.

        :return: The deepest such node below ``node`` (the first one in
                 depth-first order on ties), or ``node`` itself if no child
                 qualifies.
        """
        def qualifying_children(parent):
            # Reversed so that popping the stack visits children in their
            # original order.
            return [
                child for (child, _) in reversed(parent.transition_links)
                if child.generalized_idxs.issuperset(stringIdxs)
            ]

        deepestNode = node
        stack = qualifying_children(node)
        while stack:
            current = stack.pop()
            # Strict comparison keeps the first node found among equals.
            if current.depth > deepestNode.depth:
                deepestNode = current
            stack.extend(qualifying_children(current))
        return deepestNode

    def _generalized_word_starts(self, xs):
        """Helper method that stores the starting indexes of strings in GST
        in ``self.word_starts``."""
        self.word_starts = []
        i = 0
        for x in xs:
            self.word_starts.append(i)
            # +1 for the terminal symbol appended after every string.
            i += len(x) + 1

    def _find_node(self, y):
        """Helper method that walks down the tree along ``y``.

        :param y: String
        :return: The node on whose incoming edge (or exactly at which) ``y``
                 ends, or None if ``y`` cannot be matched.
        """
        node = self.root
        while True:
            edge = self._edgeLabel(node, node.parent)
            if edge.startswith(y):
                return node
            if not y.startswith(edge):
                # Mismatch inside the edge label.
                return None
            # The whole edge matched and y is strictly longer: consume the
            # edge and follow the transition for the next character.
            y = y[len(edge):]
            node = node._get_transition_link(y[0])
            if not node:
                return None

    def find(self, y):
        """Returns starting position of the substring y in the string used for
        building the Suffix tree.

        :param y: String
        :return: Index of a starting position of string y in the string used for building the Suffix tree
                 -1 if y is not a substring.
        """
        node = self._find_node(y)
        if node is None:
            return -1
        return node.idx

    def find_all(self, y):
        """Returns all starting positions of the substring y in the string used
        for building the Suffix tree.

        :param y: String
        :return: List of starting positions (one per leaf below the matched
                 node); empty list if y is not a substring.
        """
        node = self._find_node(y)
        if node is None:
            return []
        return [leaf.idx for leaf in node._get_leaves()]

    def _edgeLabel(self, node, parent):
        """Helper method, returns the edge label between a node and its parent"""
        return self.word[node.idx + parent.depth:node.idx + node.depth]

    def _terminalSymbolsGenerator(self):
        """Generator of unique terminal symbols used for building the Generalized Suffix Tree.

        Unicode Private Use Area U+E000..U+F8FF (followed by the supplementary
        private use ranges U+F0000..U+FFFFD and U+100000..U+10FFFD) is used so
        that terminal symbols are not expected to be part of the input string.

        :raises ValueError: once every symbol has been handed out.
        """
        py2 = sys.version_info[0] < 3
        # Only the selected branch is evaluated, so the Python 2 only name
        # is never looked up on Python 3.
        to_char = unichr if py2 else chr  # noqa: F821

        # Ranges are walked lazily: a single-string build only needs the
        # first symbol.
        private_use_ranges = (
            (0xE000, 0xF8FF),
            (0xF0000, 0xFFFFD),
            (0x100000, 0x10FFFD),
        )
        for first, last in private_use_ranges:
            for code_point in range(first, last + 1):
                yield to_char(code_point)
        raise ValueError("To many input strings.")
|
|
|
|
|
|
|
|
|
|
|
class _SNode(): |
|
|
|
"""Class representing a Node in the Suffix tree.""" |
|
|
|
|
|
|
|
def __init__(self, idx=-1, parentNode=None, depth=-1): |
|
|
|
# Links |
|
|
|
self._suffix_link = None |
|
|
|
self.transition_links = [] |
|
|
|
# Properties |
|
|
|
self.idx = idx |
|
|
|
self.depth = depth |
|
|
|
self.parent = parentNode |
|
|
|
self.generalized_idxs = {} |
|
|
|
|
|
|
|
def __str__(self): |
|
|
|
return ("SNode: idx:" + str(self.idx) + " depth:" + str(self.depth) + |
|
|
|
" transitons:" + str(self.transition_links)) |
|
|
|
|
|
|
|
def _add_suffix_link(self, snode): |
|
|
|
self._suffix_link = snode |
|
|
|
|
|
|
|
def _get_suffix_link(self): |
|
|
|
if self._suffix_link != None: |
|
|
|
return self._suffix_link |
|
|
|
else: |
|
|
|
return False |
|
|
|
|
|
|
|
def _get_transition_link(self, suffix): |
|
|
|
for node, _suffix in self.transition_links: |
|
|
|
if _suffix == '__@__' or suffix == _suffix: |
|
|
|
return node |
|
|
|
return False |
|
|
|
|
|
|
|
def _add_transition_link(self, snode, suffix=''): |
|
|
|
tl = self._get_transition_link(suffix) |
|
|
|
if tl: # TODO: imporve this. |
|
|
|
self.transition_links.remove((tl, suffix)) |
|
|
|
self.transition_links.append((snode, suffix)) |
|
|
|
|
|
|
|
def _has_transition(self, suffix): |
|
|
|
for node, _suffix in self.transition_links: |
|
|
|
if _suffix == '__@__' or suffix == _suffix: |
|
|
|
return True |
|
|
|
return False |
|
|
|
|
|
|
|
def is_leaf(self): |
|
|
|
return self.transition_links == [] |
|
|
|
|
|
|
|
def _traverse(self, f): |
|
|
|
for (node, _) in self.transition_links: |
|
|
|
node._traverse(f) |
|
|
|
f(self) |
|
|
|
|
|
|
|
def _get_leaves(self): |
|
|
|
if self.is_leaf(): |
|
|
|
return [self] |
|
|
|
else: |
|
|
|
return [ |
|
|
|
x for (n, _) in self.transition_links for x in n._get_leaves() |
|
|
|
] |