|
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320 |
- """
- @author: linlin
- @references:
- [1] `ptrus/suffix-trees <https://github.com/ptrus/suffix-trees/blob/master/suffix_trees/STree.py>`__, 2018.6
- """
-
- import sys
-
-
class STree():
    """Suffix tree of one string, or generalized suffix tree (GST) of several.

    Each input string gets a unique terminal character appended (taken from
    the Unicode Private Use Areas) before it is inserted into the tree.

    Attributes assigned by this class:
        root: root ``_SNode``; it is its own parent and its own suffix link.
        word: the terminated (and, for a GST, concatenated) text the tree
            indexes. Empty until ``build`` has run.
        word_starts: GST only -- offset in ``word`` where each input string
            begins. It is not set for a tree built from a single ``str``.
    """

    def __init__(self, input=''):
        """Create the root node and, unless ``input`` is ``''``, build the tree.

        :param input: String or List of Strings (see ``build``).
        """
        self.root = _SNode()
        self.root.depth = 0
        self.root.idx = 0
        self.root.parent = self.root
        self.root._add_suffix_link(self.root)
        # Defined up front so find()/find_all() on a tree that was never
        # built report "not found" instead of raising AttributeError.
        self.word = ''

        if not input == '':
            self.build(input)

    def _check_input(self, input):
        """Classify the input.

        :return: ``'st'`` for a ``str``, ``'gst'`` for a list whose items are
            all ``str``.
        :raises ValueError: for anything else.
        """
        if isinstance(input, str):
            return 'st'
        if isinstance(input, list) and all(
                isinstance(item, str) for item in input):
            return 'gst'

        raise ValueError("String argument should be of type String or"
                         " a list of strings")

    def build(self, x):
        """Builds the Suffix tree on the given input.

        If the input is of type List of Strings a Generalized Suffix Tree is
        built.

        :param x: String or List of Strings
        :raises ValueError: if ``x`` is neither of those.
        """
        kind = self._check_input(x)

        if kind == 'st':
            # A single string is terminated with the first terminal symbol.
            x += next(self._terminalSymbolsGenerator())
            self._build(x)
        if kind == 'gst':
            self._build_generalized(x)

    def _build(self, x):
        """Record ``x`` as the indexed text and insert all of its suffixes."""
        self.word = x
        self._build_McCreight(x)

    def _build_McCreight(self, x):
        """Builds a Suffix tree using McCreight's algorithm.

        Algorithm based on:
        McCreight, Edward M. "A space-economical suffix tree construction
        algorithm." - ACM, 1976.
        Implementation based on:
        UH CS - 58093 String Processing Algorithms Lecture Notes
        """
        u = self.root  # node the scan for the current suffix is at / below
        d = 0          # number of characters of suffix i matched so far
        for i in range(len(x)):
            # Descend while we stand exactly on a node that has a child for
            # the next character, then scan along that child's edge.
            while u.depth == d and u._has_transition(x[d + i]):
                u = u._get_transition_link(x[d + i])
                d = d + 1
                while d < u.depth and x[u.idx + d] == x[i + d]:
                    d = d + 1
            # The scan stopped in the middle of an edge: split it at depth d.
            if d < u.depth:
                u = self._create_node(x, u, d)
            self._create_leaf(x, i, u, d)
            if not u._get_suffix_link():
                self._compute_slink(x, u)
            # Continue with suffix i + 1 from the suffix link; one matched
            # character is dropped from the front, never going below zero.
            u = u._get_suffix_link()
            d = d - 1
            if d < 0:
                d = 0

    def _create_node(self, x, u, d):
        """Split the edge leading to ``u`` at depth ``d``.

        A new node ``v`` (same ``idx`` as ``u``, depth ``d``) takes ``u``'s
        place under ``u``'s former parent and adopts ``u`` as its child.

        :return: the new internal node ``v``.
        """
        i = u.idx
        p = u.parent
        v = _SNode(idx=i, depth=d)
        v._add_transition_link(u, x[i + d])
        u.parent = v
        # Same key as the link that pointed at u, so that link is replaced.
        p._add_transition_link(v, x[i + p.depth])
        v.parent = p
        return v

    def _create_leaf(self, x, i, u, d):
        """Attach a leaf for the suffix starting at ``i`` below ``u``.

        The leaf's depth is the length of that suffix, ``len(x) - i``.

        :return: the new leaf.
        """
        w = _SNode()
        w.idx = i
        w.depth = len(x) - i
        u._add_transition_link(w, x[i + d])
        w.parent = u
        return w

    def _compute_slink(self, x, u):
        """Set the suffix link of ``u`` to the node at depth ``u.depth - 1``.

        Starts from the suffix link of ``u``'s parent and walks down; if the
        walk overshoots the target depth the edge is split there.
        """
        d = u.depth
        v = u.parent._get_suffix_link()
        while v.depth < d - 1:
            v = v._get_transition_link(x[u.idx + v.depth + 1])
        if v.depth > d - 1:
            v = self._create_node(x, v, d - 1)
        u._add_suffix_link(v)

    def _build_Ukkonen(self, x):
        """Builds a Suffix tree using Ukkonen's online O(n) algorithm.

        Algorithm based on:
        Ukkonen, Esko. "On-line construction of suffix trees." - Algorithmica,
        1995.

        :raises NotImplementedError: always; not implemented yet.
        """
        # TODO.
        raise NotImplementedError()

    def _build_generalized(self, xs):
        """Builds a Generalized Suffix Tree (GST) from the list of strings.

        The strings are concatenated, each followed by its own terminal
        symbol, the tree is built on the result, and every node is then
        labelled with the indexes of the strings found below it.
        """
        terminal_gen = self._terminalSymbolsGenerator()

        _xs = ''.join([x + next(terminal_gen) for x in xs])
        self.word = _xs
        self._generalized_word_starts(xs)
        self._build(_xs)
        self.root._traverse(self._label_generalized)

    def _label_generalized(self, node):
        """Label ``node`` with the indexes of the strings in its subtree.

        A leaf gets the index of the string its suffix starts in; any other
        node gets the union of its children's labels, so children must be
        labelled before their parent.
        """
        if node.is_leaf():
            labels = {self._get_word_start_index(node.idx)}
        else:
            labels = {
                string_idx
                for child, _ in node.transition_links
                for string_idx in child.generalized_idxs
            }
        node.generalized_idxs = labels

    def _get_word_start_index(self, idx):
        """Return the index of the input string that position ``idx`` of
        ``word`` falls into."""
        for string_idx, next_start in enumerate(self.word_starts[1:]):
            if idx < next_start:
                return string_idx
        # Not before any later start: it belongs to the last string.
        return max(len(self.word_starts) - 1, 0)

    def lcs(self, stringIdxs=-1):
        """Returns the Longest Common Substring of the strings in stringIdxs.

        If stringIdxs is not provided (or is not a list), the LCS of all
        strings is returned. Requires a tree built from a list of strings.

        :param stringIdxs: Optional: List of indexes of strings.
        :return: the common substring, ``''`` if there is none.
        """
        if stringIdxs == -1 or not isinstance(stringIdxs, list):
            stringIdxs = set(range(len(self.word_starts)))
        else:
            stringIdxs = set(stringIdxs)

        deepestNode = self._find_lcs(self.root, stringIdxs)
        start = deepestNode.idx
        end = deepestNode.idx + deepestNode.depth
        if deepestNode is not self.root and deepestNode.is_leaf():
            # A leaf's label runs to the very end of the concatenated word
            # (it is selected when a single string index is requested). Cut
            # it at the terminal symbol of the string it starts in so that
            # no terminal symbols or later strings leak into the result.
            string_idx = self._get_word_start_index(start)
            if string_idx + 1 < len(self.word_starts):
                terminal_pos = self.word_starts[string_idx + 1] - 1
            else:
                terminal_pos = len(self.word) - 1
            end = min(end, terminal_pos)
        return self.word[start:end]

    def _find_lcs(self, node, stringIdxs):
        """Helper method that finds LCS by traversing the labeled GST.

        Follows only children whose label contains every index in
        ``stringIdxs`` and returns the deepest node at which no such child
        exists (``node`` itself if it has none). On equal depth the first one
        in left-to-right depth-first order wins. Iterative, so deep trees do
        not exhaust the interpreter's recursion limit.
        """
        best = None
        pending = [node]
        while pending:
            current = pending.pop()
            matching = [
                child for child, _ in current.transition_links
                if child.generalized_idxs.issuperset(stringIdxs)
            ]
            if matching:
                # Reversed so the first child is popped (visited) first.
                pending.extend(reversed(matching))
            elif best is None or current.depth > best.depth:
                best = current
        return best

    def _generalized_word_starts(self, xs):
        """Helper method that stores the starting indexes of the strings in
        the concatenated word as ``self.word_starts``."""
        self.word_starts = []
        offset = 0
        for x in xs:
            self.word_starts.append(offset)
            offset += len(x) + 1  # +1 for the terminal symbol after x

    def _find_node(self, y):
        """Walk down from the root along ``y``.

        :return: the node at or inside whose incoming edge ``y`` ends, or
            ``None`` if ``y`` does not occur in the indexed text.
        """
        node = self.root
        remaining = y
        while True:
            edge = self._edgeLabel(node, node.parent)
            if edge.startswith(remaining):
                return node
            if not remaining.startswith(edge):
                # Mismatch somewhere inside this edge.
                return None
            # The whole edge matched and more of y is left (otherwise the
            # first test above would have succeeded).
            remaining = remaining[len(edge):]
            node = node._get_transition_link(remaining[0])
            if not node:
                return None

    def find(self, y):
        """Returns a starting position of the substring y in the string used
        for building the Suffix tree.

        :param y: String
        :return: Index of a starting position of string y in the string used
            for building the Suffix tree, -1 if y is not a substring.
        """
        node = self._find_node(y)
        return -1 if node is None else node.idx

    def find_all(self, y):
        """Returns every starting position of the substring y.

        :param y: String
        :return: List of starting positions of y in the string used for
            building the Suffix tree; ``[]`` if y is not a substring.
        """
        node = self._find_node(y)
        if node is None:
            return []
        return [leaf.idx for leaf in node._get_leaves()]

    def _edgeLabel(self, node, parent):
        """Helper method, returns the edge label between a node and its
        parent."""
        return self.word[node.idx + parent.depth:node.idx + node.depth]

    def _terminalSymbolsGenerator(self):
        """Generator of unique terminal symbols used for building the
        Generalized Suffix Tree.

        Unicode Private Use Areas (U+E000..U+F8FF, U+F0000..U+FFFFD and
        U+100000..U+10FFFD) are used so that terminal symbols are unlikely to
        be part of the input strings. The code points are produced lazily.

        :raises ValueError: once every symbol has been handed out.
        """
        to_char = unichr if sys.version_info[0] < 3 else chr  # noqa: F821
        private_use_ranges = (
            (0xE000, 0xF8FF),
            (0xF0000, 0xFFFFD),
            (0x100000, 0x10FFFD),
        )
        for first, last in private_use_ranges:
            for code_point in range(first, last + 1):
                yield to_char(code_point)
        raise ValueError("To many input strings.")
-
-
class _SNode():
    """Class representing a Node in the Suffix tree.

    Attributes:
        idx: start position, in the indexed text, of a suffix passing through
            this node (-1 until assigned).
        depth: length of the path label from the root to this node (-1 until
            assigned).
        parent: parent node, or ``None`` until assigned.
        transition_links: list of ``(child, key)`` tuples in insertion order.
        generalized_idxs: set of string indexes found below this node; empty
            until the owning tree labels the node.
    """

    def __init__(self, idx=-1, parentNode=None, depth=-1):
        # Links
        self._suffix_link = None
        self.transition_links = []
        # Properties
        self.idx = idx
        self.depth = depth
        self.parent = parentNode
        # A set (not a dict) so that set operations such as issuperset()
        # work on nodes that have not been labelled yet.
        self.generalized_idxs = set()

    def __str__(self):
        return ("SNode: idx:" + str(self.idx) + " depth:" + str(self.depth) +
                " transitons:" + str(self.transition_links))

    def _add_suffix_link(self, snode):
        """Set this node's suffix link to ``snode``."""
        self._suffix_link = snode

    def _get_suffix_link(self):
        """Return the suffix link, or ``False`` if none has been set."""
        if self._suffix_link is not None:
            return self._suffix_link
        return False

    def _transition_index(self, suffix):
        """Return the position in ``transition_links`` of the first link whose
        key equals ``suffix`` or is the wildcard ``'__@__'``; -1 if none."""
        for position, (_node, key) in enumerate(self.transition_links):
            if key == '__@__' or suffix == key:
                return position
        return -1

    def _get_transition_link(self, suffix):
        """Return the child reached via ``suffix``, or ``False`` if there is
        no such transition."""
        position = self._transition_index(suffix)
        if position < 0:
            return False
        return self.transition_links[position][0]

    def _add_transition_link(self, snode, suffix=''):
        """Link ``snode`` as the child for ``suffix``.

        An existing link that matches ``suffix`` is dropped first and the new
        link is appended at the end of ``transition_links``. The old link is
        removed by position, so a match through the wildcard key no longer
        makes ``list.remove`` raise ValueError.
        """
        position = self._transition_index(suffix)
        if position >= 0:
            del self.transition_links[position]
        self.transition_links.append((snode, suffix))

    def _has_transition(self, suffix):
        """Return ``True`` if a transition for ``suffix`` exists."""
        return self._transition_index(suffix) >= 0

    def is_leaf(self):
        """Return ``True`` if this node has no children."""
        return self.transition_links == []

    def _traverse(self, f):
        """Call ``f`` on every node of this subtree in post-order.

        Children are visited in ``transition_links`` order and each node is
        visited after all of its descendants. Implemented with an explicit
        stack so that deep trees do not exhaust the recursion limit.
        """
        # Pre-order with siblings taken last-to-first; reversing that
        # sequence yields post-order with siblings first-to-last.
        visit_order = []
        pending = [self]
        while pending:
            node = pending.pop()
            visit_order.append(node)
            pending.extend(child for child, _ in node.transition_links)
        for node in reversed(visit_order):
            f(node)

    def _get_leaves(self):
        """Return the leaves of this subtree in left-to-right order (the node
        itself if it is a leaf). Iterative for the same reason as
        ``_traverse``."""
        leaves = []
        pending = [self]
        while pending:
            node = pending.pop()
            if node.is_leaf():
                leaves.append(node)
            else:
                # Reversed so the first child is popped (visited) first.
                pending.extend(
                    child for child, _ in reversed(node.transition_links))
        return leaves
|