first experiment with opacity

commit b8343c650f

@ -0,0 +1,4 @@
opacity experiment using:

* textrank python implementation (https://github.com/summanlp/textrank), modified under `summa/` so that it gives us all the sentences with their scores (see the usage sketch below).
* wikipedia python module (https://pypi.org/project/wikipedia/)
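The modified `summa.summarizer.summarize` (see the `EDIT` comment near the end of `summa/summarizer.py` below) returns every sentence as a scored `SyntacticUnit` instead of a trimmed summary. A minimal sketch of the intended call pattern, assuming the packages above are installed and the repository root is on the import path:

```python
import wikipedia
from summa.summarizer import summarize

# Fetch an article and let the modified TextRank score every sentence.
wikipedia.set_lang("en")
text = wikipedia.WikipediaPage("elderflower", preload=True).content

for s in summarize(text, split=True):      # SyntacticUnit objects, not strings
    print(round(s.score, 4), s.text[:60])  # raw PageRank score + sentence start
```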
@ -0,0 +1,233 @@
from jinja2 import Template

import sys

import wikipedia

from summa.summarizer import summarize


# TODO:
# * DONE: wiki header

# these 3 would require starting from the HTML itself and keeping an index...
# * wiki paragraph
# * wiki hyperlinks
# * list


# variables
# ------------------------------------------------------------------------

# wikipedia_page = "forest"
# wikipedia_page = "warehouse"
# wikipedia_page = "river"
wikipedia_page = "elderflower"
# wikipedia_page = "mushroom"

TEMPLATE_PATH = 'template.html'
HTML_PATH = 'www/index.html'


# utilities
# ------------------------------------------------------------------------

def map_value(value, old_min, old_max, new_min, new_max):
    # linearly remap value from [old_min, old_max] to [new_min, new_max]
    return (((value - old_min) / (old_max - old_min)) * (new_max - new_min)) + new_min


def remap_score(s, min_score, max_score):
    # normalise the TextRank score to [0, 1] and invert it:
    # the highest-ranked sentence ends up at 0, the lowest at 1
    s.score = 1 - map_value(s.score, min_score, max_score, 0, 1)
    return s


def compress_score(s):

    # compress whites (push low values further down)
    s.score = s.score ** 3

    # stretch + limiter
    # s.score = min(map_value(s.score, 0, 1, 0, 1.5), 1)
    s.score = 1 if s.score > 0.8 else s.score

    return s


# wikipedia
# ------------------------------------------------------------------------

def wikipage(pagename):
    # get a wikipedia page object from the name of the page

    print(pagename)
    wikipedia.set_lang("en")
    try:
        results = wikipedia.search(pagename, results=1, suggestion=False)
        try:
            pagename = results[0]
        except IndexError:
            # if there is no suggestion or search results, the page doesn't exist
            raise wikipedia.PageError(pagename)
        return wikipedia.WikipediaPage(pagename, redirect=True, preload=True)
    except wikipedia.exceptions.DisambiguationError as e:
        print(e.options)
        return ''


# parsing and gluing html
# ------------------------------------------------------------------------

def is_header(s):
    # wikitext headers look like "== Title ==": count the leading/trailing '='
    # signs to get the header level; returns [header_text, header_level],
    # or None if the sentence is not a header

    i = 0
    while (i < len(s.text) // 2
           and s.text[i] == '=' and s.text[len(s.text) - 1 - i] == '='):
        i += 1

    if i > 0:
        header_text = s.text[i:-i].strip()
        header_level = i
        return [header_text, header_level]
    return None


def wiki_parse(sentences):

    # TODO: doesn't work with section nesting!!
    # 1. replace wikitext headers with html headers
    # 2. add the opacity to each element
    # 3. compute an artificial score for each header: the average score of its section

    new_sentences = []

    print('--- HEADERS ---')
    for i in range(len(sentences)):

        s = sentences[i]

        # if the sentence is a header
        header = is_header(s)
        if header:
            print(header[0])

            # start computing the average score of this section
            current_total = 0
            current_count = 0
            next_header_found = False
            j = i + 1

            # iterate until we find the next header of the same or higher level
            while j < len(sentences) and not next_header_found:

                s2 = sentences[j]
                s2_header = is_header(s2)

                if s2_header:
                    print(' ' + s2_header[0])
                    if header[1] >= s2_header[1]:
                        # encountered a header of the same or higher level
                        next_header_found = True
                        print('X ' + s2_header[0])

                else:
                    # add every sentence of the section to the average
                    current_total += s2.score
                    current_count += 1

                j += 1

            if current_count != 0:
                s.score = current_total / current_count
            else:
                # empty section: keep the header fully visible
                # (a literal "NaN" here would produce invalid CSS)
                s.score = 1

            s.html = '<h{l} style="opacity:{o};">{t}</h{l}>'.format(
                l=header[1], o=s.score, t=header[0])

            # stop at the references part
            if header[0] == "References" or header[0] == "See also":
                break

            new_sentences.append(s)

        # not a header
        else:
            s.html = '<span style="opacity:{o};">{t}</span>'.format(
                o=s.score, t=s.text)
            new_sentences.append(s)

    return new_sentences


# textrank
# ------------------------------------------------------------------------

def txt2rankedsentences(txt):
    # from raw text to ranked sentences (the modified summarizer returns
    # every sentence together with its score)
    return summarize(txt, split=True)


# main
# ------------------------------------------------------------------------

if __name__ == '__main__':

    # --- WIKI REQUEST ---

    # get text from wikipedia
    print('--- WIKI ---')
    page = wikipage(wikipedia_page)
    if not page:
        sys.exit("--- STOP ---")
    title = '<h1>' + page.title + '</h1>'
    text = page.content

    # print text in terminal
    print('--- TXT ---')
    print(text)


    # --- APPLY TEXTRANK ---

    # apply textrank
    sentences = txt2rankedsentences(text)
    if not sentences:
        sys.exit("--- STOP: no sentences ranked ---")

    # print ranked sentences in terminal
    print('--- SENTENCES ---')
    for s in sentences:
        print('[{score}] : {sentence}'.format(score=s.score, sentence=s.text))


    # --- REMAP AND COMPRESS ---

    # sorted version of the list
    sorted_sentences = sorted(sentences, key=lambda s: s.score, reverse=True)
    # remap scores to the [0, 1] range
    max_score = sorted_sentences[0].score
    min_score = sorted_sentences[-1].score
    sentences = [remap_score(s, min_score, max_score) for s in sentences]
    # compress scores (make more stuff invisible)
    sentences = [compress_score(s) for s in sentences]


    # --- PARSE ---

    # turn every sentence into either a span or a header
    sentences = wiki_parse(sentences)
    # add back the page title
    sentences = [{'html': title, 'text': page.title, 'score': 1}] + sentences


    # --- TEMPLATING ---

    # get the template
    with open(TEMPLATE_PATH, 'r') as file:
        template = Template(file.read())
    # render the template
    html = template.render(sentences=sentences)
    with open(HTML_PATH, 'w') as file:
        file.write(html)
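The script above expects a `template.html` next to it and writes the result to `www/index.html`; neither file is part of this diff. A hypothetical minimal equivalent of the template, written as an inline Jinja2 string so the rendering step can be tried in isolation (the real template may differ):

```python
from jinja2 import Template

# Assumed minimal template: emit each item's pre-built HTML fragment in order.
TEMPLATE = Template("""<!doctype html>
<html>
  <body>
    {% for s in sentences %}{{ s.html }}
    {% endfor %}
  </body>
</html>""")

# The script passes SyntacticUnit objects plus one plain dict for the title;
# Jinja2 resolves `s.html` for both attribute and key access.
sentences = [{'html': '<h1>Elderflower</h1>', 'text': 'Elderflower', 'score': 1}]
print(TEMPLATE.render(sentences=sentences))
```

Because a plain `Template` does not autoescape, the pre-rendered `<span>`/`<h*>` fragments come through as markup rather than escaped text, which is what the opacity styling relies on.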
@ -0,0 +1,2 @@
from summa import commons, graph, keywords, pagerank_weighted, \
    summarizer, syntactic_unit, textrank
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@ -0,0 +1,15 @@
from .graph import Graph


def build_graph(sequence):
    graph = Graph()
    for item in sequence:
        if not graph.has_node(item):
            graph.add_node(item)
    return graph


def remove_unreachable_nodes(graph):
    for node in graph.nodes():
        if sum(graph.edge_weight((node, other)) for other in graph.neighbors(node)) == 0:
            graph.del_node(node)
@ -0,0 +1,2 @@
class TextrankRuntimeError(RuntimeError):
    pass
@ -0,0 +1,244 @@
|
||||
from abc import ABCMeta, abstractmethod
|
||||
|
||||
|
||||
class IGraph(metaclass=ABCMeta):
|
||||
"""
|
||||
Represents the interface or contract that the graph for TextRank should implement
|
||||
"""
|
||||
|
||||
@abstractmethod
|
||||
def nodes(self):
|
||||
"""
|
||||
Return node list.
|
||||
|
||||
@rtype: list
|
||||
@return: Node list.
|
||||
"""
|
||||
pass
|
||||
|
||||
|
||||
@abstractmethod
|
||||
def edges(self):
|
||||
"""
|
||||
Return all edges in the graph.
|
||||
|
||||
@rtype: list
|
||||
@return: List of all edges in the graph.
|
||||
"""
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def neighbors(self, node):
|
||||
"""
|
||||
Return all nodes that are directly accessible from given node.
|
||||
|
||||
@type node: node
|
||||
@param node: Node identifier
|
||||
|
||||
@rtype: list
|
||||
@return: List of nodes directly accessible from given node.
|
||||
"""
|
||||
pass
|
||||
|
||||
|
||||
@abstractmethod
|
||||
def has_node(self, node):
|
||||
"""
|
||||
Return whether the requested node exists.
|
||||
|
||||
@type node: node
|
||||
@param node: Node identifier
|
||||
|
||||
@rtype: boolean
|
||||
@return: Truth-value for node existence.
|
||||
"""
|
||||
pass
|
||||
|
||||
|
||||
@abstractmethod
|
||||
def add_node(self, node, attrs=None):
|
||||
"""
|
||||
Add given node to the graph.
|
||||
|
||||
@attention: While nodes can be of any type, it's strongly recommended to use only
|
||||
numbers and single-line strings as node identifiers if you intend to use write().
|
||||
|
||||
@type node: node
|
||||
@param node: Node identifier.
|
||||
|
||||
@type attrs: list
|
||||
@param attrs: List of node attributes specified as (attribute, value) tuples.
|
||||
"""
|
||||
pass
|
||||
|
||||
|
||||
@abstractmethod
|
||||
def add_edge(self, edge, wt=1, label='', attrs=[]):
|
||||
"""
|
||||
Add an edge to the graph connecting two nodes.
|
||||
|
||||
An edge, here, is a pair of nodes like C{(n, m)}.
|
||||
|
||||
@type edge: tuple
|
||||
@param edge: Edge.
|
||||
|
||||
@type wt: number
|
||||
@param wt: Edge weight.
|
||||
|
||||
@type label: string
|
||||
@param label: Edge label.
|
||||
|
||||
@type attrs: list
|
||||
@param attrs: List of node attributes specified as (attribute, value) tuples.
|
||||
"""
|
||||
pass
|
||||
|
||||
|
||||
@abstractmethod
|
||||
def has_edge(self, edge):
|
||||
"""
|
||||
Return whether an edge exists.
|
||||
|
||||
@type edge: tuple
|
||||
@param edge: Edge.
|
||||
|
||||
@rtype: boolean
|
||||
@return: Truth-value for edge existence.
|
||||
"""
|
||||
pass
|
||||
|
||||
|
||||
@abstractmethod
|
||||
def edge_weight(self, edge):
|
||||
"""
|
||||
Get the weight of an edge.
|
||||
|
||||
@type edge: edge
|
||||
@param edge: One edge.
|
||||
|
||||
@rtype: number
|
||||
@return: Edge weight.
|
||||
"""
|
||||
pass
|
||||
|
||||
|
||||
@abstractmethod
|
||||
def del_node(self, node):
|
||||
"""
|
||||
Remove a node from the graph.
|
||||
|
||||
@type node: node
|
||||
@param node: Node identifier.
|
||||
"""
|
||||
pass
|
||||
|
||||
|
||||
class Graph(IGraph):
|
||||
"""
|
||||
Implementation of an undirected graph, based on Pygraph
|
||||
"""
|
||||
|
||||
WEIGHT_ATTRIBUTE_NAME = "weight"
|
||||
DEFAULT_WEIGHT = 0
|
||||
|
||||
LABEL_ATTRIBUTE_NAME = "label"
|
||||
DEFAULT_LABEL = ""
|
||||
|
||||
def __init__(self):
|
||||
# Metadata about edges
|
||||
self.edge_properties = {}  # Mapping: Edge -> Dict mapping, label -> str, wt -> num
|
||||
self.edge_attr = {} # Key value pairs: (Edge -> Attributes)
|
||||
# Metadata about nodes
|
||||
self.node_attr = {} # Pairing: Node -> Attributes
|
||||
self.node_neighbors = {} # Pairing: Node -> Neighbors
|
||||
|
||||
def has_edge(self, edge):
|
||||
u,v = edge
|
||||
return (u,v) in self.edge_properties and (v,u) in self.edge_properties
|
||||
|
||||
def edge_weight(self, edge):
|
||||
return self.get_edge_properties( edge ).setdefault( self.WEIGHT_ATTRIBUTE_NAME, self.DEFAULT_WEIGHT )
|
||||
|
||||
def neighbors(self, node):
|
||||
return self.node_neighbors[node]
|
||||
|
||||
def has_node(self, node):
|
||||
return node in self.node_neighbors
|
||||
|
||||
def add_edge(self, edge, wt=1, label='', attrs=[]):
|
||||
u, v = edge
|
||||
if (v not in self.node_neighbors[u] and u not in self.node_neighbors[v]):
|
||||
self.node_neighbors[u].append(v)
|
||||
if (u != v):
|
||||
self.node_neighbors[v].append(u)
|
||||
|
||||
self.add_edge_attributes((u,v), attrs)
|
||||
self.set_edge_properties((u, v), label=label, weight=wt)
|
||||
else:
|
||||
raise ValueError("Edge (%s, %s) already in graph" % (u, v))
|
||||
|
||||
def add_node(self, node, attrs=None):
|
||||
if attrs is None:
|
||||
attrs = []
|
||||
if (not node in self.node_neighbors):
|
||||
self.node_neighbors[node] = []
|
||||
self.node_attr[node] = attrs
|
||||
else:
|
||||
raise ValueError("Node %s already in graph" % node)
|
||||
|
||||
def nodes(self):
|
||||
return list(self.node_neighbors.keys())
|
||||
|
||||
def edges(self):
|
||||
return [ a for a in list(self.edge_properties.keys()) ]
|
||||
|
||||
def del_node(self, node):
|
||||
for each in list(self.neighbors(node)):
|
||||
if (each != node):
|
||||
self.del_edge((each, node))
|
||||
del(self.node_neighbors[node])
|
||||
del(self.node_attr[node])
|
||||
|
||||
# Helper methods
|
||||
def get_edge_properties(self, edge):
|
||||
return self.edge_properties.setdefault( edge, {} )
|
||||
|
||||
def add_edge_attributes(self, edge, attrs):
|
||||
for attr in attrs:
|
||||
self.add_edge_attribute(edge, attr)
|
||||
|
||||
def add_edge_attribute(self, edge, attr):
|
||||
self.edge_attr[edge] = self.edge_attributes(edge) + [attr]
|
||||
|
||||
if (edge[0] != edge[1]):
|
||||
self.edge_attr[(edge[1],edge[0])] = self.edge_attributes((edge[1], edge[0])) + [attr]
|
||||
|
||||
def edge_attributes(self, edge):
|
||||
try:
|
||||
return self.edge_attr[edge]
|
||||
except KeyError:
|
||||
return []
|
||||
|
||||
def set_edge_properties(self, edge, **properties ):
|
||||
self.edge_properties.setdefault( edge, {} ).update( properties )
|
||||
if (edge[0] != edge[1]):
|
||||
self.edge_properties.setdefault((edge[1], edge[0]), {}).update( properties )
|
||||
|
||||
def del_edge(self, edge):
|
||||
u, v = edge
|
||||
self.node_neighbors[u].remove(v)
|
||||
self.del_edge_labeling((u, v))
|
||||
if (u != v):
|
||||
self.node_neighbors[v].remove(u)
|
||||
self.del_edge_labeling((v, u)) # TODO: This is redundant
|
||||
|
||||
def del_edge_labeling( self, edge ):
|
||||
keys = [edge]
|
||||
keys.append(edge[::-1])
|
||||
|
||||
for key in keys:
|
||||
for mapping in [self.edge_properties, self.edge_attr ]:
|
||||
try:
|
||||
del ( mapping[key] )
|
||||
except KeyError:
|
||||
pass
|
@ -0,0 +1,227 @@
|
||||
from itertools import combinations as _combinations
|
||||
from queue import Queue
|
||||
|
||||
from .pagerank_weighted import pagerank_weighted_scipy as _pagerank
|
||||
from .preprocessing.textcleaner import clean_text_by_word as _clean_text_by_word
|
||||
from .preprocessing.textcleaner import tokenize_by_word as _tokenize_by_word
|
||||
from .commons import build_graph as _build_graph
|
||||
from .commons import remove_unreachable_nodes as _remove_unreachable_nodes
|
||||
|
||||
WINDOW_SIZE = 2
|
||||
|
||||
"""Check tags in http://www.clips.ua.ac.be/pages/mbsp-tags and use only first two letters
|
||||
Example: filter for nouns and adjectives:
|
||||
INCLUDING_FILTER = ['NN', 'JJ']"""
|
||||
INCLUDING_FILTER = ['NN', 'JJ']
|
||||
EXCLUDING_FILTER = []
|
||||
|
||||
|
||||
def _get_pos_filters():
|
||||
return frozenset(INCLUDING_FILTER), frozenset(EXCLUDING_FILTER)
|
||||
|
||||
|
||||
def _get_words_for_graph(tokens):
|
||||
include_filters, exclude_filters = _get_pos_filters()
|
||||
if include_filters and exclude_filters:
|
||||
raise ValueError("Can't use both include and exclude filters, should use only one")
|
||||
|
||||
result = []
|
||||
for word, unit in tokens.items():
|
||||
if exclude_filters and unit.tag in exclude_filters:
|
||||
continue
|
||||
if (include_filters and unit.tag in include_filters) or not include_filters or not unit.tag:
|
||||
result.append(unit.token)
|
||||
return result
|
||||
|
||||
|
||||
def _get_first_window(split_text):
|
||||
return split_text[:WINDOW_SIZE]
|
||||
|
||||
|
||||
def _set_graph_edge(graph, tokens, word_a, word_b):
|
||||
if word_a in tokens and word_b in tokens:
|
||||
lemma_a = tokens[word_a].token
|
||||
lemma_b = tokens[word_b].token
|
||||
edge = (lemma_a, lemma_b)
|
||||
|
||||
if graph.has_node(lemma_a) and graph.has_node(lemma_b) and not graph.has_edge(edge):
|
||||
graph.add_edge(edge)
|
||||
|
||||
|
||||
def _process_first_window(graph, tokens, split_text):
|
||||
first_window = _get_first_window(split_text)
|
||||
for word_a, word_b in _combinations(first_window, 2):
|
||||
_set_graph_edge(graph, tokens, word_a, word_b)
|
||||
|
||||
|
||||
def _init_queue(split_text):
|
||||
queue = Queue()
|
||||
first_window = _get_first_window(split_text)
|
||||
for word in first_window[1:]:
|
||||
queue.put(word)
|
||||
return queue
|
||||
|
||||
|
||||
def _process_word(graph, tokens, queue, word):
|
||||
for word_to_compare in _queue_iterator(queue):
|
||||
_set_graph_edge(graph, tokens, word, word_to_compare)
|
||||
|
||||
|
||||
def _update_queue(queue, word):
|
||||
queue.get()
|
||||
queue.put(word)
|
||||
assert queue.qsize() == (WINDOW_SIZE - 1)
|
||||
|
||||
|
||||
def _process_text(graph, tokens, split_text):
|
||||
queue = _init_queue(split_text)
|
||||
for i in range(WINDOW_SIZE, len(split_text)):
|
||||
word = split_text[i]
|
||||
_process_word(graph, tokens, queue, word)
|
||||
_update_queue(queue, word)
|
||||
|
||||
|
||||
def _queue_iterator(queue):
|
||||
iterations = queue.qsize()
|
||||
for i in range(iterations):
|
||||
var = queue.get()
|
||||
yield var
|
||||
queue.put(var)
|
||||
|
||||
|
||||
def _set_graph_edges(graph, tokens, split_text):
|
||||
_process_first_window(graph, tokens, split_text)
|
||||
_process_text(graph, tokens, split_text)
|
||||
|
||||
|
||||
def _extract_tokens(lemmas, scores, ratio, words):
|
||||
lemmas.sort(key=lambda s: scores[s], reverse=True)
|
||||
|
||||
# If no "words" option is selected, the number of lemmas returned is
# reduced by the provided ratio; otherwise the ratio is ignored.
|
||||
length = len(lemmas) * ratio if words is None else words
|
||||
return [(scores[lemmas[i]], lemmas[i],) for i in range(int(length))]
|
||||
|
||||
|
||||
def _lemmas_to_words(tokens):
|
||||
lemma_to_word = {}
|
||||
for word, unit in tokens.items():
|
||||
lemma = unit.token
|
||||
if lemma in lemma_to_word:
|
||||
lemma_to_word[lemma].append(word)
|
||||
else:
|
||||
lemma_to_word[lemma] = [word]
|
||||
return lemma_to_word
|
||||
|
||||
|
||||
def _get_keywords_with_score(extracted_lemmas, lemma_to_word):
|
||||
"""
|
||||
:param extracted_lemmas:list of tuples
|
||||
:param lemma_to_word: dict of {lemma:list of words}
|
||||
:return: dict of {keyword:score}
|
||||
"""
|
||||
keywords = {}
|
||||
for score, lemma in extracted_lemmas:
|
||||
keyword_list = lemma_to_word[lemma]
|
||||
for keyword in keyword_list:
|
||||
keywords[keyword] = score
|
||||
return keywords
|
||||
|
||||
|
||||
def _strip_word(word):
|
||||
stripped_word_list = list(_tokenize_by_word(word))
|
||||
return stripped_word_list[0] if stripped_word_list else ""
|
||||
|
||||
|
||||
def _get_combined_keywords(_keywords, split_text):
|
||||
"""
|
||||
:param keywords:dict of keywords:scores
|
||||
:param split_text: list of strings
|
||||
:return: combined_keywords:list
|
||||
"""
|
||||
result = []
|
||||
_keywords = _keywords.copy()
|
||||
len_text = len(split_text)
|
||||
for i in range(len_text):
|
||||
word = _strip_word(split_text[i])
|
||||
if word in _keywords:
|
||||
combined_word = [word]
|
||||
if i + 1 == len_text:
|
||||
result.append(word) # appends last word if keyword and doesn't iterate
|
||||
for j in range(i + 1, len_text):
|
||||
other_word = _strip_word(split_text[j])
|
||||
if other_word in _keywords and other_word == split_text[j] \
|
||||
and other_word not in combined_word:
|
||||
combined_word.append(other_word)
|
||||
else:
|
||||
for keyword in combined_word:
|
||||
_keywords.pop(keyword)
|
||||
result.append(" ".join(combined_word))
|
||||
break
|
||||
return result
|
||||
|
||||
|
||||
def _get_average_score(concept, _keywords):
|
||||
word_list = concept.split()
|
||||
word_counter = 0
|
||||
total = 0
|
||||
for word in word_list:
|
||||
total += _keywords[word]
|
||||
word_counter += 1
|
||||
return total / word_counter
|
||||
|
||||
|
||||
def _format_results(_keywords, combined_keywords, split, scores):
|
||||
"""
|
||||
:param keywords:dict of keywords:scores
|
||||
:param combined_keywords:list of word/s
|
||||
"""
|
||||
combined_keywords.sort(key=lambda w: _get_average_score(w, _keywords), reverse=True)
|
||||
if scores:
|
||||
return [(word, _get_average_score(word, _keywords)) for word in combined_keywords]
|
||||
if split:
|
||||
return combined_keywords
|
||||
return "\n".join(combined_keywords)
|
||||
|
||||
|
||||
def keywords(text, ratio=0.2, words=None, language="english", split=False, scores=False, deaccent=False, additional_stopwords=None):
|
||||
if not isinstance(text, str):
|
||||
raise ValueError("Text parameter must be a Unicode object (str)!")
|
||||
|
||||
# Gets a dict of word -> lemma
|
||||
tokens = _clean_text_by_word(text, language, deacc=deaccent, additional_stopwords=additional_stopwords)
|
||||
split_text = list(_tokenize_by_word(text))
|
||||
|
||||
# Creates the graph and adds the edges
|
||||
graph = _build_graph(_get_words_for_graph(tokens))
|
||||
_set_graph_edges(graph, tokens, split_text)
|
||||
del split_text # It's no longer used
|
||||
|
||||
_remove_unreachable_nodes(graph)
|
||||
|
||||
# PageRank cannot be run in an empty graph.
|
||||
if len(graph.nodes()) == 0:
|
||||
return [] if split else ""
|
||||
|
||||
# Ranks the tokens using the PageRank algorithm. Returns dict of lemma -> score
|
||||
pagerank_scores = _pagerank(graph)
|
||||
|
||||
extracted_lemmas = _extract_tokens(graph.nodes(), pagerank_scores, ratio, words)
|
||||
|
||||
lemmas_to_word = _lemmas_to_words(tokens)
|
||||
keywords = _get_keywords_with_score(extracted_lemmas, lemmas_to_word)
|
||||
|
||||
# text.split() to keep numbers and punctuation marks, so separated concepts are not combined
|
||||
combined_keywords = _get_combined_keywords(keywords, text.split())
|
||||
|
||||
return _format_results(keywords, combined_keywords, split, scores)
|
||||
|
||||
|
||||
def get_graph(text, language="english", deaccent=False):
|
||||
tokens = _clean_text_by_word(text, language, deacc=deaccent)
|
||||
split_text = list(_tokenize_by_word(text, deacc=deaccent))
|
||||
|
||||
graph = _build_graph(_get_words_for_graph(tokens))
|
||||
_set_graph_edges(graph, tokens, split_text)
|
||||
|
||||
return graph
|
@ -0,0 +1,86 @@
|
||||
from scipy.sparse import csr_matrix
|
||||
from scipy.linalg import eig
|
||||
from numpy import empty as empty_matrix
|
||||
|
||||
CONVERGENCE_THRESHOLD = 0.0001
|
||||
|
||||
|
||||
def pagerank_weighted(graph, initial_value=None, damping=0.85):
|
||||
"""Calculates PageRank for an undirected graph"""
|
||||
if initial_value is None: initial_value = 1.0 / len(graph.nodes())
|
||||
scores = dict.fromkeys(graph.nodes(), initial_value)
|
||||
|
||||
iteration_quantity = 0
|
||||
for iteration_number in range(100):
|
||||
iteration_quantity += 1
|
||||
convergence_achieved = 0
|
||||
for i in graph.nodes():
|
||||
rank = 1 - damping
|
||||
for j in graph.neighbors(i):
|
||||
neighbors_sum = sum(graph.edge_weight((j, k)) for k in graph.neighbors(j))
|
||||
rank += damping * scores[j] * graph.edge_weight((j, i)) / neighbors_sum
|
||||
|
||||
if abs(scores[i] - rank) <= CONVERGENCE_THRESHOLD:
|
||||
convergence_achieved += 1
|
||||
|
||||
scores[i] = rank
|
||||
|
||||
if convergence_achieved == len(graph.nodes()):
|
||||
break
|
||||
|
||||
return scores
|
||||
|
||||
|
||||
def pagerank_weighted_scipy(graph, damping=0.85):
|
||||
adjacency_matrix = build_adjacency_matrix(graph)
|
||||
probability_matrix = build_probability_matrix(graph)
|
||||
|
||||
# Suppress deprecation warnings from numpy.
|
||||
# See https://github.com/summanlp/textrank/issues/57
|
||||
import warnings
|
||||
with warnings.catch_warnings():
|
||||
from numpy import VisibleDeprecationWarning
|
||||
warnings.filterwarnings("ignore", category=VisibleDeprecationWarning)
|
||||
warnings.filterwarnings("ignore", category=PendingDeprecationWarning)
|
||||
pagerank_matrix = damping * adjacency_matrix.todense() + (1 - damping) * probability_matrix
|
||||
|
||||
vals, vecs = eig(pagerank_matrix, left=True, right=False)
|
||||
return process_results(graph, vecs)
|
||||
|
||||
|
||||
def build_adjacency_matrix(graph):
|
||||
row = []
|
||||
col = []
|
||||
data = []
|
||||
nodes = graph.nodes()
|
||||
length = len(nodes)
|
||||
|
||||
for i in range(length):
|
||||
current_node = nodes[i]
|
||||
neighbors_sum = sum(graph.edge_weight((current_node, neighbor)) for neighbor in graph.neighbors(current_node))
|
||||
for j in range(length):
|
||||
edge_weight = float(graph.edge_weight((current_node, nodes[j])))
|
||||
if i != j and edge_weight != 0:
|
||||
row.append(i)
|
||||
col.append(j)
|
||||
data.append(edge_weight / neighbors_sum)
|
||||
|
||||
return csr_matrix((data,(row,col)), shape=(length,length))
|
||||
|
||||
|
||||
def build_probability_matrix(graph):
|
||||
dimension = len(graph.nodes())
|
||||
matrix = empty_matrix((dimension,dimension))
|
||||
|
||||
probability = 1 / float(dimension)
|
||||
matrix.fill(probability)
|
||||
|
||||
return matrix
|
||||
|
||||
|
||||
def process_results(graph, vecs):
|
||||
scores = {}
|
||||
for i, node in enumerate(graph.nodes()):
|
||||
scores[node] = abs(vecs[i][0])
|
||||
|
||||
return scores
|
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@ -0,0 +1,635 @@
|
||||
# Adapted from the NLTK package v3.0.1:
|
||||
# https://github.com/nltk/nltk/blob/3.0.1/nltk/stem/porter.py
|
||||
|
||||
# Copyright (c) 2002 Vivake Gupta (vivakeATomniscia.org). All rights reserved.
|
||||
#
|
||||
# This program is free software; you can redistribute it and/or
|
||||
# modify it under the terms of the GNU General Public License as
|
||||
# published by the Free Software Foundation; either version 2 of the
|
||||
# License, or (at your option) any later version.
|
||||
#
|
||||
# This program is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# GNU General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU General Public License
|
||||
# along with this program; if not, write to the Free Software
|
||||
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
|
||||
# USA
|
||||
#
|
||||
# This software is maintained by Vivake (vivakeATomniscia.org) and is available at:
|
||||
# http://www.omniscia.org/~vivake/python/PorterStemmer.py
|
||||
#
|
||||
# Additional modifications were made to incorporate this module into
|
||||
# NLTK. All such modifications are marked with "--NLTK--". The NLTK
|
||||
# version of this module is maintained by NLTK developers,
|
||||
# and is available via http://nltk.org/
|
||||
#
|
||||
# GNU Linking Exception:
|
||||
# Using this module statically or dynamically with other modules is
|
||||
# making a combined work based on this module. Thus, the terms and
|
||||
# conditions of the GNU General Public License cover the whole combination.
|
||||
# As a special exception, the copyright holders of this module give
|
||||
# you permission to combine this module with independent modules to
|
||||
# produce an executable program, regardless of the license terms of these
|
||||
# independent modules, and to copy and distribute the resulting
|
||||
# program under terms of your choice, provided that you also meet,
|
||||
# for each linked independent module, the terms and conditions of
|
||||
# the license of that module. An independent module is a module which
|
||||
# is not derived from or based on this module. If you modify this module,
|
||||
# you may extend this exception to your version of the module, but you
|
||||
# are not obliged to do so. If you do not wish to do so, delete this
|
||||
# exception statement from your version.
|
||||
|
||||
"""
|
||||
Porter Stemmer
|
||||
|
||||
This is the Porter stemming algorithm, ported to Python from the
|
||||
version coded up in ANSI C by the author. It follows the algorithm
|
||||
presented in
|
||||
|
||||
Porter, M. "An algorithm for suffix stripping." Program 14.3 (1980): 130-137.
|
||||
|
||||
only differing from it at the points marked --DEPARTURE-- and --NEW--
|
||||
below.
|
||||
|
||||
For a more faithful version of the Porter algorithm, see
|
||||
|
||||
http://www.tartarus.org/~martin/PorterStemmer/
|
||||
|
||||
Later additions:
|
||||
|
||||
June 2000
|
||||
|
||||
The 'l' of the 'logi' -> 'log' rule is put with the stem, so that
|
||||
short stems like 'geo' 'theo' etc work like 'archaeo' 'philo' etc.
|
||||
|
||||
This follows a suggestion of Barry Wilkins, research student at
|
||||
Birmingham.
|
||||
|
||||
|
||||
February 2000
|
||||
|
||||
the cvc test for not dropping final -e now looks after vc at the
|
||||
beginning of a word, so are, eve, ice, ore, use keep final -e. In this
|
||||
test c is any consonant, including w, x and y. This extension was
|
||||
suggested by Chris Emerson.
|
||||
|
||||
-fully -> -ful treated like -fulness -> -ful, and
|
||||
-tionally -> -tion treated like -tional -> -tion
|
||||
|
||||
both in Step 2. These were suggested by Hiranmay Ghosh, of New Delhi.
|
||||
|
||||
Invariants proceed, succeed, exceed. Also suggested by Hiranmay Ghosh.
|
||||
|
||||
Additional modifications were made to incorporate this module into
|
||||
nltk. All such modifications are marked with \"--NLTK--\". The nltk
|
||||
version of this module is maintained by the NLTK developers, and is
|
||||
available from <http://nltk.sourceforge.net>
|
||||
"""
|
||||
|
||||
|
||||
## --NLTK--
|
||||
## Declare this module's documentation format.
|
||||
|
||||
class PorterStemmer():
|
||||
|
||||
## --NLTK--
|
||||
## Add a module docstring
|
||||
"""
|
||||
A word stemmer based on the Porter stemming algorithm.
|
||||
|
||||
Porter, M. \"An algorithm for suffix stripping.\"
|
||||
Program 14.3 (1980): 130-137.
|
||||
|
||||
A few minor modifications have been made to Porter's basic
|
||||
algorithm. See the source code of this module for more
|
||||
information.
|
||||
|
||||
The Porter Stemmer requires that all tokens have string types.
|
||||
"""
|
||||
|
||||
# The main part of the stemming algorithm starts here.
|
||||
# Note that only lower case sequences are stemmed. Forcing to lower case
|
||||
# should be done before stem(...) is called.
|
||||
|
||||
def __init__(self):
|
||||
|
||||
## --NEW--
|
||||
## This is a table of irregular forms. It is quite short, but still
|
||||
## reflects the errors actually drawn to Martin Porter's attention over
|
||||
## a 20 year period!
|
||||
##
|
||||
## Extend it as necessary.
|
||||
##
|
||||
## The form of the table is:
|
||||
## {
|
||||
## "p1" : ["s11","s12","s13", ... ],
|
||||
## "p2" : ["s21","s22","s23", ... ],
|
||||
## ...
|
||||
## "pn" : ["sn1","sn2","sn3", ... ]
|
||||
## }
|
||||
##
|
||||
## String sij is mapped to paradigm form pi, and the main stemming
|
||||
## process is then bypassed.
|
||||
|
||||
irregular_forms = {
|
||||
"sky" : ["sky", "skies"],
|
||||
"die" : ["dying"],
|
||||
"lie" : ["lying"],
|
||||
"tie" : ["tying"],
|
||||
"news" : ["news"],
|
||||
"inning" : ["innings", "inning"],
|
||||
"outing" : ["outings", "outing"],
|
||||
"canning" : ["cannings", "canning"],
|
||||
"howe" : ["howe"],
|
||||
|
||||
# --NEW--
|
||||
"proceed" : ["proceed"],
|
||||
"exceed" : ["exceed"],
|
||||
"succeed" : ["succeed"], # Hiranmay Ghosh
|
||||
}
|
||||
|
||||
self.pool = {}
|
||||
for key in irregular_forms:
|
||||
for val in irregular_forms[key]:
|
||||
self.pool[val] = key
|
||||
|
||||
self.vowels = frozenset(['a', 'e', 'i', 'o', 'u'])
|
||||
|
||||
def _cons(self, word, i):
|
||||
"""cons(i) is TRUE <=> b[i] is a consonant."""
|
||||
if word[i] in self.vowels:
|
||||
return False
|
||||
if word[i] == 'y':
|
||||
if i == 0:
|
||||
return True
|
||||
else:
|
||||
return (not self._cons(word, i - 1))
|
||||
return True
|
||||
|
||||
def _m(self, word, j):
|
||||
"""m() measures the number of consonant sequences between k0 and j.
|
||||
if c is a consonant sequence and v a vowel sequence, and <..>
|
||||
indicates arbitrary presence,
|
||||
|
||||
<c><v> gives 0
|
||||
<c>vc<v> gives 1
|
||||
<c>vcvc<v> gives 2
|
||||
<c>vcvcvc<v> gives 3
|
||||
....
|
||||
"""
|
||||
n = 0
|
||||
i = 0
|
||||
while True:
|
||||
if i > j:
|
||||
return n
|
||||
if not self._cons(word, i):
|
||||
break
|
||||
i = i + 1
|
||||
i = i + 1
|
||||
|
||||
while True:
|
||||
while True:
|
||||
if i > j:
|
||||
return n
|
||||
if self._cons(word, i):
|
||||
break
|
||||
i = i + 1
|
||||
i = i + 1
|
||||
n = n + 1
|
||||
|
||||
while True:
|
||||
if i > j:
|
||||
return n
|
||||
if not self._cons(word, i):
|
||||
break
|
||||
i = i + 1
|
||||
i = i + 1
|
||||
|
||||
def _vowelinstem(self, stem):
|
||||
"""vowelinstem(stem) is TRUE <=> stem contains a vowel"""
|
||||
for i in range(len(stem)):
|
||||
if not self._cons(stem, i):
|
||||
return True
|
||||
return False
|
||||
|
||||
def _doublec(self, word):
|
||||
"""doublec(word) is TRUE <=> word ends with a double consonant"""
|
||||
if len(word) < 2:
|
||||
return False
|
||||
if (word[-1] != word[-2]):
|
||||
return False
|
||||
return self._cons(word, len(word)-1)
|
||||
|
||||
def _cvc(self, word, i):
|
||||
"""cvc(i) is TRUE <=>
|
||||
|
||||
a) ( --NEW--) i == 1, and word[0] word[1] is vowel consonant, or
|
||||
|
||||
b) word[i - 2], word[i - 1], word[i] has the form consonant -
|
||||
vowel - consonant and also if the second c is not w, x or y. this
|
||||
is used when trying to restore an e at the end of a short word.
|
||||
e.g.
|
||||
|
||||
cav(e), lov(e), hop(e), crim(e), but
|
||||
snow, box, tray.
|
||||
"""
|
||||
if i == 0: return False # i == 0 never happens perhaps
|
||||
if i == 1: return (not self._cons(word, 0) and self._cons(word, 1))
|
||||
if not self._cons(word, i) or self._cons(word, i-1) or not self._cons(word, i-2): return False
|
||||
|
||||
ch = word[i]
|
||||
if ch == 'w' or ch == 'x' or ch == 'y':
|
||||
return False
|
||||
|
||||
return True
|
||||
|
||||
def _step1ab(self, word):
|
||||
"""step1ab() gets rid of plurals and -ed or -ing. e.g.
|
||||
|
||||
caresses -> caress
|
||||
ponies -> poni
|
||||
sties -> sti
|
||||
tie -> tie (--NEW--: see below)
|
||||
caress -> caress
|
||||
cats -> cat
|
||||
|
||||
feed -> feed
|
||||
agreed -> agree
|
||||
disabled -> disable
|
||||
|
||||
matting -> mat
|
||||
mating -> mate
|
||||
meeting -> meet
|
||||
milling -> mill
|
||||
messing -> mess
|
||||
|
||||
meetings -> meet
|
||||
"""
|
||||
if word[-1] == 's':
|
||||
if word.endswith("sses"):
|
||||
word = word[:-2]
|
||||
elif word.endswith("ies"):
|
||||
if len(word) == 4:
|
||||
word = word[:-1]
|
||||
# this line extends the original algorithm, so that
|
||||
# 'flies'->'fli' but 'dies'->'die' etc
|
||||
else:
|
||||
word = word[:-2]
|
||||
elif word[-2] != 's':
|
||||
word = word[:-1]
|
||||
|
||||
ed_or_ing_trimmed = False
|
||||
if word.endswith("ied"):
|
||||
if len(word) == 4:
|
||||
word = word[:-1]
|
||||
else:
|
||||
word = word[:-2]
|
||||
# this line extends the original algorithm, so that
|
||||
# 'spied'->'spi' but 'died'->'die' etc
|
||||
|
||||
elif word.endswith("eed"):
|
||||
if self._m(word, len(word)-4) > 0:
|
||||
word = word[:-1]
|
||||
|
||||
|
||||
elif word.endswith("ed") and self._vowelinstem(word[:-2]):
|
||||
word = word[:-2]
|
||||
ed_or_ing_trimmed = True
|
||||
elif word.endswith("ing") and self._vowelinstem(word[:-3]):
|
||||
word = word[:-3]
|
||||
ed_or_ing_trimmed = True
|
||||
|
||||
if ed_or_ing_trimmed:
|
||||
if word.endswith("at") or word.endswith("bl") or word.endswith("iz"):
|
||||
word += 'e'
|
||||
elif self._doublec(word):
|
||||
if word[-1] not in ['l', 's', 'z']:
|
||||
word = word[:-1]
|
||||
elif (self._m(word, len(word)-1) == 1 and self._cvc(word, len(word)-1)):
|
||||
word += 'e'
|
||||
|
||||
return word
|
||||
|
||||
def _step1c(self, word):
|
||||
"""step1c() turns terminal y to i when there is another vowel in the stem.
|
||||
--NEW--: This has been modified from the original Porter algorithm so that y->i
|
||||
is only done when y is preceded by a consonant, but not if the stem
|
||||
is only a single consonant, i.e.
|
||||
|
||||
(*c and not c) Y -> I
|
||||
|
||||
So 'happy' -> 'happi', but
|
||||
'enjoy' -> 'enjoy' etc
|
||||
|
||||
This is a much better rule. Formerly 'enjoy'->'enjoi' and 'enjoyment'->
|
||||
'enjoy'. Step 1c is perhaps done too soon; but with this modification that
|
||||
no longer really matters.
|
||||
|
||||
Also, the removal of the vowelinstem(z) condition means that 'spy', 'fly',
|
||||
'try' ... stem to 'spi', 'fli', 'tri' and conflate with 'spied', 'tried',
|
||||
'flies' ...
|
||||
"""
|
||||
if word[-1] == 'y' and len(word) > 2 and self._cons(word, len(word) - 2):
|
||||
return word[:-1] + 'i'
|
||||
else:
|
||||
return word
|
||||
|
||||
def _step2(self, word):
|
||||
"""step2() maps double suffices to single ones.
|
||||
so -ization ( = -ize plus -ation) maps to -ize etc. note that the
|
||||
string before the suffix must give m() > 0.
|
||||
"""
|
||||
if len(word) <= 1: # Only possible at this stage given unusual inputs to stem_word like 'oed'
|
||||
return word
|
||||
|
||||
ch = word[-2]
|
||||
|
||||
if ch == 'a':
|
||||
if word.endswith("ational"):
|
||||
return word[:-7] + "ate" if self._m(word, len(word)-8) > 0 else word
|
||||
elif word.endswith("tional"):
|
||||
return word[:-2] if self._m(word, len(word)-7) > 0 else word
|
||||
else:
|
||||
return word
|
||||
elif ch == 'c':
|
||||
if word.endswith("enci"):
|
||||
return word[:-4] + "ence" if self._m(word, len(word)-5) > 0 else word
|
||||
elif word.endswith("anci"):
|
||||
return word[:-4] + "ance" if self._m(word, len(word)-5) > 0 else word
|
||||
else:
|
||||
return word
|
||||
elif ch == 'e':
|
||||
if word.endswith("izer"):
|
||||
return word[:-1] if self._m(word, len(word)-5) > 0 else word
|
||||
else:
|
||||
return word
|
||||
elif ch == 'l':
|
||||
if word.endswith("bli"):
|
||||
return word[:-3] + "ble" if self._m(word, len(word)-4) > 0 else word # --DEPARTURE--
|
||||
# To match the published algorithm, replace "bli" with "abli" and "ble" with "able"
|
||||
elif word.endswith("alli"):
|
||||
# --NEW--
|
||||
if self._m(word, len(word)-5) > 0:
|
||||
word = word[:-2]
|
||||
return self._step2(word)
|
||||
else:
|
||||
return word
|
||||
elif word.endswith("fulli"):
|
||||
return word[:-2] if self._m(word, len(word)-6) else word # --NEW--
|
||||
elif word.endswith("entli"):
|
||||
return word[:-2] if self._m(word, len(word)-6) else word
|
||||
elif word.endswith("eli"):
|
||||
return word[:-2] if self._m(word, len(word)-4) else word
|
||||
elif word.endswith("ousli"):
|
||||
return word[:-2] if self._m(word, len(word)-6) else word
|
||||
else:
|
||||
return word
|
||||
elif ch == 'o':
|
||||
if word.endswith("ization"):
|
||||
return word[:-7] + "ize" if self._m(word, len(word)-8) else word
|
||||
elif word.endswith("ation"):
|
||||
return word[:-5] + "ate" if self._m(word, len(word)-6) else word
|
||||
elif word.endswith("ator"):
|
||||
return word[:-4] + "ate" if self._m(word, len(word)-5) else word
|
||||
else:
|
||||
return word
|
||||
elif ch == 's':
|
||||
if word.endswith("alism"):
|
||||
return word[:-3] if self._m(word, len(word)-6) else word
|
||||
elif word.endswith("ness"):
|
||||
if word.endswith("iveness"):
|
||||
return word[:-4] if self._m(word, len(word)-8) else word
|
||||
elif word.endswith("fulness"):
|
||||
return word[:-4] if self._m(word, len(word)-8) else word
|
||||
elif word.endswith("ousness"):
|
||||
return word[:-4] if self._m(word, len(word)-8) else word
|
||||
else:
|
||||
return word
|
||||
else:
|
||||
return word
|
||||
elif ch == 't':
|
||||
if word.endswith("aliti"):
|
||||
return word[:-3] if self._m(word, len(word)-6) else word
|
||||
elif word.endswith("iviti"):
|
||||
return word[:-5] + "ive" if self._m(word, len(word)-6) else word
|
||||
elif word.endswith("biliti"):
|
||||
return word[:-6] + "ble" if self._m(word, len(word)-7) else word
|
||||
else:
|
||||
return word
|
||||
elif ch == 'g': # --DEPARTURE--
|
||||
if word.endswith("logi"):
|
||||
return word[:-1] if self._m(word, len(word) - 4) else word # --NEW-- (Barry Wilkins)
|
||||
# To match the published algorithm, pass len(word)-5 to _m instead of len(word)-4
|
||||
else:
|
||||
return word
|
||||
|
||||
else:
|
||||
return word
|
||||
|
||||
def _step3(self, word):
|
||||
"""step3() deals with -ic-, -full, -ness etc. similar strategy to step2."""
|
||||
|
||||
ch = word[-1]
|
||||
|
||||
if ch == 'e':
|
||||
if word.endswith("icate"):
|
||||
return word[:-3] if self._m(word, len(word)-6) else word
|
||||
elif word.endswith("ative"):
|
||||
return word[:-5] if self._m(word, len(word)-6) else word
|
||||
elif word.endswith("alize"):
|
||||
return word[:-3] if self._m(word, len(word)-6) else word
|
||||
else:
|
||||
return word
|
||||
elif ch == 'i':
|
||||
if word.endswith("iciti"):
|
||||
return word[:-3] if self._m(word, len(word)-6) else word
|
||||
else:
|
||||
return word
|
||||
elif ch == 'l':
|
||||
if word.endswith("ical"):
|
||||
return word[:-2] if self._m(word, len(word)-5) else word
|
||||
elif word.endswith("ful"):
|
||||
return word[:-3] if self._m(word, len(word)-4) else word
|
||||
else:
|
||||
return word
|
||||
elif ch == 's':
|
||||
if word.endswith("ness"):
|
||||
return word[:-4] if self._m(word, len(word)-5) else word
|
||||
else:
|
||||
return word
|
||||
|
||||
else:
|
||||
return word
|
||||
|
||||
def _step4(self, word):
|
||||
"""step4() takes off -ant, -ence etc., in context <c>vcvc<v>."""
|
||||
|
||||
if len(word) <= 1: # Only possible at this stage given unusual inputs to stem_word like 'oed'
|
||||
return word
|
||||
|
||||
ch = word[-2]
|
||||
|
||||
if ch == 'a':
|
||||
if word.endswith("al"):
|
||||
return word[:-2] if self._m(word, len(word)-3) > 1 else word
|
||||
else:
|
||||
return word
|
||||
elif ch == 'c':
|
||||
if word.endswith("ance"):
|
||||
return word[:-4] if self._m(word, len(word)-5) > 1 else word
|
||||
elif word.endswith("ence"):
|
||||
return word[:-4] if self._m(word, len(word)-5) > 1 else word
|
||||
else:
|
||||
return word
|
||||
elif ch == 'e':
|
||||
if word.endswith("er"):
|
||||
return word[:-2] if self._m(word, len(word)-3) > 1 else word
|
||||
else:
|
||||
return word
|
||||
elif ch == 'i':
|
||||
if word.endswith("ic"):
|
||||
return word[:-2] if self._m(word, len(word)-3) > 1 else word
|
||||
else:
|
||||
return word
|
||||
elif ch == 'l':
|
||||
if word.endswith("able"):
|
||||
return word[:-4] if self._m(word, len(word)-5) > 1 else word
|
||||
elif word.endswith("ible"):
|
||||
return word[:-4] if self._m(word, len(word)-5) > 1 else word
|
||||
else:
|
||||
return word
|
||||
elif ch == 'n':
|
||||
if word.endswith("ant"):
|
||||
return word[:-3] if self._m(word, len(word)-4) > 1 else word
|
||||
elif word.endswith("ement"):
|
||||
return word[:-5] if self._m(word, len(word)-6) > 1 else word
|
||||
elif word.endswith("ment"):
|
||||
return word[:-4] if self._m(word, len(word)-5) > 1 else word
|
||||
elif word.endswith("ent"):
|
||||
return word[:-3] if self._m(word, len(word)-4) > 1 else word
|
||||
else:
|
||||
return word
|
||||
elif ch == 'o':
|
||||
if word.endswith("sion") or word.endswith("tion"): # slightly different logic to all the other cases
|
||||
return word[:-3] if self._m(word, len(word)-4) > 1 else word
|
||||
elif word.endswith("ou"):
|
||||
return word[:-2] if self._m(word, len(word)-3) > 1 else word
|
||||
else:
|
||||
return word
|
||||
elif ch == 's':
|
||||
if word.endswith("ism"):
|
||||
return word[:-3] if self._m(word, len(word)-4) > 1 else word
|
||||
else:
|
||||
return word
|
||||
elif ch == 't':
|
||||
if word.endswith("ate"):
|
||||
return word[:-3] if self._m(word, len(word)-4) > 1 else word
|
||||
elif word.endswith("iti"):
|
||||
return word[:-3] if self._m(word, len(word)-4) > 1 else word
|
||||
else:
|
||||
return word
|
||||
elif ch == 'u':
|
||||
if word.endswith("ous"):
|
||||
return word[:-3] if self._m(word, len(word)-4) > 1 else word
|
||||
else:
|
||||
return word
|
||||
elif ch == 'v':
|
||||
if word.endswith("ive"):
|
||||
return word[:-3] if self._m(word, len(word)-4) > 1 else word
|
||||
else:
|
||||
return word
|
||||
elif ch == 'z':
|
||||
if word.endswith("ize"):
|
||||
return word[:-3] if self._m(word, len(word)-4) > 1 else word
|
||||
else:
|
||||
return word
|
||||
else:
|
||||
return word
|
||||
|
||||
def _step5(self, word):
|
||||
"""step5() removes a final -e if m() > 1, and changes -ll to -l if
|
||||
m() > 1.
|
||||
"""
|
||||
if word[-1] == 'e':
|
||||
a = self._m(word, len(word)-1)
|
||||
if a > 1 or (a == 1 and not self._cvc(word, len(word)-2)):
|
||||
word = word[:-1]
|
||||
if word.endswith('ll') and self._m(word, len(word)-1) > 1:
|
||||
word = word[:-1]
|
||||
|
||||
return word
|
||||
|
||||
def stem_word(self, p, i=0, j=None):
|
||||
"""
|
||||
Returns the stem of p, or, if i and j are given, the stem of p[i:j+1].
|
||||
"""
|
||||
## --NLTK--
|
||||
if j is None and i == 0:
|
||||
word = p
|
||||
else:
|
||||
if j is None:
|
||||
j = len(p) - 1
|
||||
word = p[i:j+1]
|
||||
|
||||
if word in self.pool:
|
||||
return self.pool[word]
|
||||
|
||||
if len(word) <= 2:
|
||||
return word # --DEPARTURE--
|
||||
# With this line, strings of length 1 or 2 don't go through the
|
||||
# stemming process, although no mention is made of this in the
|
||||
# published algorithm. Remove the line to match the published
|
||||
# algorithm.
|
||||
|
||||
word = self._step1ab(word)
|
||||
word = self._step1c(word)
|
||||
word = self._step2(word)
|
||||
word = self._step3(word)
|
||||
word = self._step4(word)
|
||||
word = self._step5(word)
|
||||
return word
|
||||
|
||||
def _adjust_case(self, word, stem):
|
||||
lower = word.lower()
|
||||
|
||||
ret = ""
|
||||
for x in range(len(stem)):
|
||||
if lower[x] == stem[x]:
|
||||
ret += word[x]
|
||||
else:
|
||||
ret += stem[x]
|
||||
|
||||
return ret
|
||||
|
||||
## --NLTK--
|
||||
## Don't use this procedure; we want to work with individual
|
||||
## tokens, instead. (commented out the following procedure)
|
||||
#def stem(self, text):
|
||||
# parts = re.split("(\W+)", text)
|
||||
# numWords = (len(parts) + 1)/2
|
||||
#
|
||||
# ret = ""
|
||||
# for i in xrange(numWords):
|
||||
# word = parts[2 * i]
|
||||
# separator = ""
|
||||
# if ((2 * i) + 1) < len(parts):
|
||||
# separator = parts[(2 * i) + 1]
|
||||
#
|
||||
# stem = self.stem_word(string.lower(word), 0, len(word) - 1)
|
||||
# ret = ret + self.adjust_case(word, stem)
|
||||
# ret = ret + separator
|
||||
# return ret
|
||||
|
||||
## --NLTK--
|
||||
## Define a stem() method that implements the StemmerI interface.
|
||||
def stem(self, word):
|
||||
stem = self.stem_word(word.lower(), 0, len(word) - 1)
|
||||
return self._adjust_case(word, stem)
|
||||
|
||||
## --NLTK--
|
||||
## Add a string representation function
|
||||
def __repr__(self):
|
||||
return '<PorterStemmer>'
|
File diff suppressed because it is too large
@ -0,0 +1,188 @@
|
||||
import string
|
||||
import unicodedata
|
||||
import logging
|
||||
|
||||
logger = logging.getLogger('summa.preprocessing.cleaner')
|
||||
|
||||
try:
|
||||
from pattern.en import tag
|
||||
logger.info("'pattern' package found; tag filters are available for English")
|
||||
HAS_PATTERN = True
|
||||
except ImportError:
|
||||
logger.info("'pattern' package not found; tag filters are not available for English")
|
||||
HAS_PATTERN = False
|
||||
|
||||
import re
|
||||
|
||||
from .snowball import SnowballStemmer
|
||||
from .stopwords import get_stopwords_by_language
|
||||
from summa.syntactic_unit import SyntacticUnit
|
||||
|
||||
|
||||
# Utility functions adapted from Gensim v0.10.0:
|
||||
# https://github.com/RaRe-Technologies/gensim/blob/0.10.0/gensim/utils.py
|
||||
# https://github.com/RaRe-Technologies/gensim/blob/0.10.0/gensim/parsing/preprocessing.py
|
||||
|
||||
|
||||
SEPARATOR = r"@"
|
||||
RE_SENTENCE = re.compile('(\S.+?[.!?])(?=\s+|$)|(\S.+?)(?=[\n]|$)')
|
||||
AB_SENIOR = re.compile("([A-Z][a-z]{1,2}\.)\s(\w)")
|
||||
AB_ACRONYM = re.compile("(\.[a-zA-Z]\.)\s(\w)")
|
||||
AB_ACRONYM_LETTERS = re.compile("([a-zA-Z])\.([a-zA-Z])\.")
|
||||
UNDO_AB_SENIOR = re.compile("([A-Z][a-z]{1,2}\.)" + SEPARATOR + "(\w)")
|
||||
UNDO_AB_ACRONYM = re.compile("(\.[a-zA-Z]\.)" + SEPARATOR + "(\w)")
|
||||
|
||||
STEMMER = None
|
||||
STOPWORDS = None
|
||||
|
||||
|
||||
def set_stemmer_language(language):
|
||||
global STEMMER
|
||||
if not language in SnowballStemmer.languages:
|
||||
raise ValueError("Valid languages are: " + ", ".join(sorted(SnowballStemmer.languages)))
|
||||
STEMMER = SnowballStemmer(language)
|
||||
|
||||
|
||||
def set_stopwords_by_language(language, additional_stopwords):
|
||||
global STOPWORDS
|
||||
words = get_stopwords_by_language(language)
|
||||
if not additional_stopwords:
|
||||
additional_stopwords = {}
|
||||
STOPWORDS = frozenset({ w for w in words.split() if w } | { w for w in additional_stopwords if w })
|
||||
|
||||
|
||||
def init_textcleanner(language, additional_stopwords):
|
||||
set_stemmer_language(language)
|
||||
set_stopwords_by_language(language, additional_stopwords)
|
||||
|
||||
|
||||
def split_sentences(text):
|
||||
processed = replace_abbreviations(text)
|
||||
return [undo_replacement(sentence) for sentence in get_sentences(processed)]
|
||||
|
||||
|
||||
def replace_abbreviations(text):
|
||||
return replace_with_separator(text, SEPARATOR, [AB_SENIOR, AB_ACRONYM])
|
||||
|
||||
|
||||
def undo_replacement(sentence):
|
||||
return replace_with_separator(sentence, r" ", [UNDO_AB_SENIOR, UNDO_AB_ACRONYM])
|
||||
|
||||
|
||||
def replace_with_separator(text, separator, regexs):
|
||||
replacement = r"\1" + separator + r"\2"
|
||||
result = text
|
||||
for regex in regexs:
|
||||
result = regex.sub(replacement, result)
|
||||
return result
|
||||
|
||||
|
||||
def get_sentences(text):
|
||||
for match in RE_SENTENCE.finditer(text):
|
||||
yield match.group()
|
||||
|
||||
|
||||
# Taken from Gensim
|
||||
RE_PUNCT = re.compile('([%s])+' % re.escape(string.punctuation), re.UNICODE)
|
||||
def strip_punctuation(s):
|
||||
return RE_PUNCT.sub(" ", s)
|
||||
|
||||
|
||||
# Taken from Gensim
|
||||
RE_NUMERIC = re.compile(r"[0-9]+", re.UNICODE)
|
||||
def strip_numeric(s):
|
||||
return RE_NUMERIC.sub("", s)
|
||||
|
||||
|
||||
def remove_stopwords(sentence):
|
||||
return " ".join(w for w in sentence.split() if w not in STOPWORDS)
|
||||
|
||||
|
||||
def stem_sentence(sentence):
|
||||
word_stems = [STEMMER.stem(word) for word in sentence.split()]
|
||||
return " ".join(word_stems)
|
||||
|
||||
|
||||
def apply_filters(sentence, filters):
|
||||
for f in filters:
|
||||
sentence = f(sentence)
|
||||
return sentence
|
||||
|
||||
|
||||
def filter_words(sentences):
|
||||
filters = [lambda x: x.lower(), strip_numeric, strip_punctuation, remove_stopwords,
|
||||
stem_sentence]
|
||||
apply_filters_to_token = lambda token: apply_filters(token, filters)
|
||||
return list(map(apply_filters_to_token, sentences))
|
||||
|
||||
|
||||
# Taken from Gensim
|
||||
def deaccent(text):
|
||||
"""
|
||||
Remove accentuation from the given string.
|
||||
"""
|
||||
norm = unicodedata.normalize("NFD", text)
|
||||
result = "".join(ch for ch in norm if unicodedata.category(ch) != 'Mn')
|
||||
return unicodedata.normalize("NFC", result)
|
||||
|
||||
|
||||
# Taken from Gensim
|
||||
PAT_ALPHABETIC = re.compile('(((?![\d])\w)+)', re.UNICODE)
|
||||
def tokenize(text, lowercase=False, deacc=False):
|
||||
"""
|
||||
Iteratively yield tokens as unicode strings, optionally also lowercasing them
|
||||
and removing accent marks.
|
||||
"""
|
||||
if lowercase:
|
||||
text = text.lower()
|
||||
if deacc:
|
||||
text = deaccent(text)
|
||||
for match in PAT_ALPHABETIC.finditer(text):
|
||||
yield match.group()
|
||||
|
||||
|
||||
def merge_syntactic_units(original_units, filtered_units, tags=None):
|
||||
units = []
|
||||
for i in range(len(original_units)):
|
||||
if filtered_units[i] == '':
|
||||
continue
|
||||
|
||||
text = original_units[i]
|
||||
token = filtered_units[i]
|
||||
tag = tags[i][1] if tags else None
|
||||
sentence = SyntacticUnit(text, token, tag)
|
||||
sentence.index = i
|
||||
|
||||
units.append(sentence)
|
||||
|
||||
return units
|
||||
|
||||
|
||||
def clean_text_by_sentences(text, language="english", additional_stopwords=None):
|
||||
""" Tokenizes a given text into sentences, applying filters and lemmatizing them.
|
||||
Returns a SyntacticUnit list. """
|
||||
init_textcleanner(language, additional_stopwords)
|
||||
original_sentences = split_sentences(text)
|
||||
filtered_sentences = filter_words(original_sentences)
|
||||
|
||||
return merge_syntactic_units(original_sentences, filtered_sentences)
|
||||
|
||||
|
||||
def clean_text_by_word(text, language="english", deacc=False, additional_stopwords=None):
|
||||
""" Tokenizes a given text into words, applying filters and lemmatizing them.
|
||||
Returns a dict of word -> syntacticUnit. """
|
||||
init_textcleanner(language, additional_stopwords)
|
||||
text_without_acronyms = replace_with_separator(text, "", [AB_ACRONYM_LETTERS])
|
||||
original_words = list(tokenize(text_without_acronyms, lowercase=True, deacc=deacc))
|
||||
filtered_words = filter_words(original_words)
|
||||
if HAS_PATTERN:
|
||||
tags = tag(" ".join(original_words)) # tag needs the context of the words in the text
|
||||
else:
|
||||
tags = None
|
||||
units = merge_syntactic_units(original_words, filtered_words, tags)
|
||||
return { unit.text : unit for unit in units }
|
||||
|
||||
|
||||
def tokenize_by_word(text, deacc=False):
|
||||
text_without_acronyms = replace_with_separator(text, "", [AB_ACRONYM_LETTERS])
|
||||
return tokenize(text_without_acronyms, lowercase=True, deacc=deacc)
|
@ -0,0 +1,24 @@
|
||||
# Natural Language Toolkit: Stemmer Utilities
|
||||
#
|
||||
# Copyright (C) 2001-2019 NLTK Project
|
||||
# Author: Helder <he7d3r@gmail.com>
|
||||
# URL: <http://nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
|
||||
def suffix_replace(original, old, new):
|
||||
"""
|
||||
Replaces the old suffix of the original string by a new suffix
|
||||
"""
|
||||
return original[: -len(old)] + new
|
||||
|
||||
|
||||
def prefix_replace(original, old, new):
|
||||
"""
|
||||
Replaces the old prefix of the original string by a new suffix
|
||||
:param original: string
|
||||
:param old: string
|
||||
:param new: string
|
||||
:return: string
|
||||
"""
|
||||
return new + original[len(old) :]
|
@ -0,0 +1,154 @@
from math import log10

from .pagerank_weighted import pagerank_weighted_scipy as _pagerank
from .preprocessing.textcleaner import clean_text_by_sentences as _clean_text_by_sentences
from .commons import build_graph as _build_graph
from .commons import remove_unreachable_nodes as _remove_unreachable_nodes


def _set_graph_edge_weights(graph):
    for sentence_1 in graph.nodes():
        for sentence_2 in graph.nodes():

            edge = (sentence_1, sentence_2)
            if sentence_1 != sentence_2 and not graph.has_edge(edge):
                similarity = _get_similarity(sentence_1, sentence_2)
                if similarity != 0:
                    graph.add_edge(edge, similarity)

    # Handles the case in which all similarities are zero.
    # The resultant summary will consist of random sentences.
    if all(graph.edge_weight(edge) == 0 for edge in graph.edges()):
        _create_valid_graph(graph)


def _create_valid_graph(graph):
    nodes = graph.nodes()

    for i in range(len(nodes)):
        for j in range(len(nodes)):
            if i == j:
                continue

            edge = (nodes[i], nodes[j])

            if graph.has_edge(edge):
                graph.del_edge(edge)

            graph.add_edge(edge, 1)


def _get_similarity(s1, s2):
    words_sentence_one = s1.split()
    words_sentence_two = s2.split()

    common_word_count = _count_common_words(words_sentence_one, words_sentence_two)

    log_s1 = log10(len(words_sentence_one))
    log_s2 = log10(len(words_sentence_two))

    if log_s1 + log_s2 == 0:
        return 0

    return common_word_count / (log_s1 + log_s2)


def _count_common_words(words_sentence_one, words_sentence_two):
    return len(set(words_sentence_one) & set(words_sentence_two))


def _format_results(extracted_sentences, split, score):
    if score:
        return [(sentence.text, sentence.score) for sentence in extracted_sentences]
    if split:
        return [sentence.text for sentence in extracted_sentences]
    return "\n".join([sentence.text for sentence in extracted_sentences])


def _add_scores_to_sentences(sentences, scores):
    for sentence in sentences:
        # Adds the score to the object if it has one.
        if sentence.token in scores:
            sentence.score = scores[sentence.token]
        else:
            sentence.score = 0


def _get_sentences_with_word_count(sentences, words):
    """ Given a list of sentences, returns a list of sentences with a
    total word count similar to the word count provided.
    """
    word_count = 0
    selected_sentences = []
    # Loops until the word count is reached.
    for sentence in sentences:
        words_in_sentence = len(sentence.text.split())

        # Checks if the inclusion of the sentence gives a better approximation
        # to the word parameter.
        if abs(words - word_count - words_in_sentence) > abs(words - word_count):
            return selected_sentences

        selected_sentences.append(sentence)
        word_count += words_in_sentence

    return selected_sentences


def _extract_most_important_sentences(sentences, ratio, words):
    sentences.sort(key=lambda s: s.score, reverse=True)

    # If no "words" option is selected, the number of sentences is
    # reduced by the provided ratio.
    if words is None:
        length = len(sentences) * ratio
        return sentences[:int(length)]

    # Else, the ratio is ignored.
    else:
        return _get_sentences_with_word_count(sentences, words)


def summarize(text, ratio=0.2, words=None, language="english", split=False, scores=False, additional_stopwords=None):
    if not isinstance(text, str):
        raise ValueError("Text parameter must be a Unicode object (str)!")

    # Gets a list of processed sentences.
    sentences = _clean_text_by_sentences(text, language, additional_stopwords)

    # Creates the graph and calculates the similarity coefficient for every pair of nodes.
    graph = _build_graph([sentence.token for sentence in sentences])
    _set_graph_edge_weights(graph)

    # Remove all nodes with all edges weights equal to zero.
    _remove_unreachable_nodes(graph)

    # PageRank cannot be run in an empty graph.
    if len(graph.nodes()) == 0:
        return [] if split else ""

    # Ranks the tokens using the PageRank algorithm. Returns dict of sentence -> score
    pagerank_scores = _pagerank(graph)

    # Adds the summa scores to the sentence objects.
    _add_scores_to_sentences(sentences, pagerank_scores)

    # EDIT: return the whole sentences with scores
    return sentences

    # Extracts the most important sentences with the selected criterion.
    # extracted_sentences = _extract_most_important_sentences(sentences, ratio, words)

    # Sorts the extracted sentences by apparition order in the original text.
    # extracted_sentences.sort(key=lambda s: s.index)

    # return _format_results(extracted_sentences, split, scores)


def get_graph(text, language="english"):
    sentences = _clean_text_by_sentences(text, language)

    graph = _build_graph([sentence.token for sentence in sentences])
    _set_graph_edge_weights(graph)

    return graph
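The "EDIT" above is the change this experiment relies on: summarize() now returns every sentence as a SyntacticUnit carrying its PageRank score instead of a trimmed summary string. A minimal sketch of reading those scores, assuming the modified package is importable as summa; the sample text reuses sentences from the warehouse example further down, and the score-to-opacity mapping is left to the caller:

    from summa.summarizer import summarize

    text = ("A warehouse is a building for storing goods. "
            "They usually have loading docks to load and unload goods from trucks. "
            "Stored goods can include any raw materials, packing materials, spare parts, "
            "components, or finished goods associated with agriculture, manufacturing, and production.")

    for sentence in summarize(text):
        # sentence.score is the raw PageRank value for that sentence
        print(round(sentence.score, 3), sentence.text)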
@ -0,0 +1,14 @@
class SyntacticUnit(object):

    def __init__(self, text, token=None, tag=None):
        self.text = text
        self.token = token
        self.tag = tag[:2] if tag else None  # just first two letters of tag
        self.index = -1
        self.score = -1

    def __str__(self):
        return "Original unit: '" + self.text + "' *-*-*-* " + "Processed unit: '" + self.token + "'"

    def __repr__(self):
        return str(self)
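A tiny illustration of the container above (the values are made up, with SyntacticUnit in scope):

    unit = SyntacticUnit("A warehouse is a building.", token="warehous build")
    unit.score = 0.42
    print(unit)   # Original unit: 'A warehouse is a building.' *-*-*-* Processed unit: 'warehous build'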
@ -0,0 +1,97 @@
import argparse
import os
import sys
import warnings

from .summarizer import summarize
from .keywords import keywords

# Types of summarization
SENTENCE = 0
WORD = 1

DEFAULT_RATIO = 0.2


def textrank(text, summarize_by=SENTENCE, ratio=DEFAULT_RATIO, words=None, additional_stopwords=None):
    if summarize_by == SENTENCE:
        return summarize(text, ratio, words, additional_stopwords=additional_stopwords)
    else:
        return keywords(text, ratio, words, additional_stopwords=additional_stopwords)


def existing_file(file_name):
    try:
        with open(file_name, 'r') as file:
            return file.read()
    except Exception:
        raise argparse.ArgumentTypeError("The file provided could not be opened.")


def restricted_float(x):
    x = float(x)
    if x < 0.0 or x > 1.0:
        raise argparse.ArgumentTypeError("{} not in range [0.0, 1.0]".format(x))
    return x


def parse_args(args):
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter, prog="textrank", description="Extract the most relevant sentences or keywords of a given text using the TextRank algorithm.")

    group = parser.add_mutually_exclusive_group(required=True)
    # New API
    group.add_argument('--summarize', metavar="path/to/file", type=existing_file,
                       help="Run textrank to summarize the input text.")
    group.add_argument('--keywords', metavar="path/to/file", type=existing_file,
                       help="Run textrank to extract keywords from the input text.")
    # Old API
    group.add_argument('--text', '-t', metavar="path/to/file", type=existing_file,
                       help="(Deprecated) Text to summarize if --summary option is selected")

    parser.add_argument('--summary', '-s', metavar="{0,1}", type=int, choices=[SENTENCE, WORD], default=0,
                        help="(Deprecated) Type of unit to summarize: sentence (0) or word (1)")
    parser.add_argument('--ratio', '-r', metavar="r", type=restricted_float, default=DEFAULT_RATIO,
                        help="Float number (0,1] that defines the length of the summary. It's a proportion of the original text")
    parser.add_argument('--words', '-w', metavar="#words", type=int,
                        help="Number to limit the length of the summary. The length option is ignored if the word limit is set.")
    parser.add_argument('--additional_stopwords', '-a', metavar="list,of,stopwords",
                        help="Either a string of comma separated stopwords or a path to a file which has comma separated stopwords in every line")

    return parser.parse_args(args)


def main():
    args = parse_args(sys.argv[1:])

    mode = None
    text = None

    if args.summarize:
        text = args.summarize
        mode = SENTENCE
    elif args.keywords:
        text = args.keywords
        mode = WORD
    elif args.summary:  # Old api
        warnings.warn("The --summary option is deprecated. Please use either --summarize or --keywords", DeprecationWarning)
        text = args.text
        mode = args.summary

        if text is None:
            raise argparse.ArgumentTypeError('Error: no text to summarize provided.')
    else:
        raise argparse.ArgumentTypeError('Error: --summarize or --keywords is required')

    additional_stopwords = None
    if args.additional_stopwords:
        if os.path.exists(args.additional_stopwords):
            with open(args.additional_stopwords) as f:
                additional_stopwords = {s for l in f for s in l.strip().split(",")}
        else:
            additional_stopwords = args.additional_stopwords.split(",")

    print(textrank(text, mode, args.ratio, args.words, additional_stopwords))


if __name__ == "__main__":
    main()
@ -0,0 +1,31 @@
<!DOCTYPE html>

<html lang="en">

<head>

    <meta charset="UTF-8" />

    <title>TextRank Opacity</title>

    <link rel="stylesheet" type="text/css" href="css/main.css" />
    <link rel="stylesheet" type="text/css" href="css/typography.css" />

    <meta name="viewport" content="width=device-width, initial-scale=1">

</head>

<body>

    <main>

        {% for s in sentences %}
            {{ s.html|safe }}
        {% endfor %}

    </main>

</body>

</html>
@ -0,0 +1 @@
A warehouse is a building for storing goods. Warehouses are used by manufacturers, importers, exporters, wholesalers, transport businesses, customs, etc. They are usually large plain buildings in industrial parks on the outskirts of cities, towns, or villages. They usually have loading docks to load and unload goods from trucks. Sometimes warehouses are designed for the loading and unloading of goods directly from railways, airports, or seaports. They often have cranes and forklifts for moving goods, which are usually placed on ISO standard pallets and then loaded into pallet racks. Stored goods can include any raw materials, packing materials, spare parts, components, or finished goods associated with agriculture, manufacturing, and production. In India and Hong Kong, a warehouse may be referred to as a "godown". There are also godowns in the Shanghai Bund.
@ -0,0 +1,38 @@
:root{
    --lh: 1.35rem;
}

body{
    margin: var(--lh);
    line-height: var(--lh);
}

@media print{
    body{
        margin: 0;
        font-size: 10pt;
    }
}

main{
    max-width: 42rem;
    margin: 0 auto;
}

/* h1,h2,h3,h4,h5,h6{
    line-height: var(--lh);
} */

h1{
    text-align: center;
    margin: calc(2 * var(--lh)) 0;
}

h2,h3,h4,h5,h6{
    margin: calc(3 * var(--lh)) 0 var(--lh);
}

:is(h1,h2,h3,h4,h5,h6) + :is(h1,h2,h3,h4,h5,h6){
    margin-top: var(--lh);
}