first experiment with opacity

master
Dorian 2 years ago
commit b8343c650f

@@ -0,0 +1,4 @@
opacity experiment using:
* textrank python implementation (https://github.com/summanlp/textrank), modified under `summa/` so that it returns all sentences together with their scores.
* wikipedia python module (https://pypi.org/project/wikipedia/)
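
A minimal sketch of how the two pieces fit together (assuming the modified `summarize` under `summa/` returns sentence objects exposing `.text` and `.score`, which is what the script relies on):

    import wikipedia
    from summa.summarizer import summarize

    wikipedia.set_lang("en")
    page = wikipedia.WikipediaPage("elderflower", redirect=True, preload=True)
    for s in summarize(page.content, split=True):
        # with the modification under summa/, every sentence comes back scored
        print(round(s.score, 3), s.text[:60])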

@@ -0,0 +1,233 @@
from jinja2 import Template
import os
import wikipedia
from markdown import markdown
# importing module
import sys
# appending a path
# sys.path.append('textrank')
# importing required module
import summa.summarizer
from summa.summarizer import summarize
# TODO:
# * DONE: wiki header
# these three would require starting from the HTML itself and keeping an index...
# * wiki paragraph
# * wiki hyperlinks
# * list
# variables
# ------------------------------------------------------------------------
# wikipedia_page = "forest"
# wikipedia_page = "warehouse"
# wikipedia_page = "river"
wikipedia_page = "elderflower"
# wikipedia_page = "mushroom"
TEMPLATE_PATH = 'template.html'
HTML_PATH = 'www/index.html'
# utilities
# ------------------------------------------------------------------------
def map_value(value, min, max, new_min, new_max):
return (((value - min) / (max - min)) * (new_max - new_min)) + new_min
def remap_score(s, min_score, max_score):
s.score = 1 - map_value(s.score, min_score, max_score, 0, 1)
return s
def compress_score(s):
# compress whites
s.score = s.score**3
# stretch + limiter
# s.score = min(map_value(s.score, 0, 1, 0, 1.5), 1)
s.score = 1 if s.score > 0.8 else s.score
return s
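# Worked example of the two mappings above (illustrative values): with
# min_score=0.05 and max_score=0.45, remap_score sends a raw textrank score
# of 0.45 to 0, 0.05 to 1 and 0.25 to 0.5; compress_score then cubes the
# value (0.5 -> 0.125) and snaps anything above 0.8 back up to 1, so only
# the lowest-ranked sentences remain fully opaque.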
# wikipedia
# ------------------------------------------------------------------------
def wikipage(pagename):
# get the wikipedia page object by page name
print(pagename)
wikipedia.set_lang("en")
try:
results = wikipedia.search(pagename, results=1, suggestion=False)
try:
pagename = results[0]
except IndexError:
# if there is no suggestion or search results, the page doesn't exist
raise wikipedia.PageError(pagename)
return wikipedia.WikipediaPage(pagename, redirect=True, preload=True)
except wikipedia.exceptions.DisambiguationError as e:
print(e.options)
page = ''
return page
# parsing and gluing html
# ------------------------------------------------------------------------
def is_header(s):
# i is the header level
i = 0
while s.text[i] == '=' and s.text[len(s.text) - 1 - i] == '=':
i += 1
if i > 0:
header_text = s.text[i:(-1-i)].strip()
header_level = i
return [header_text, header_level]
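# Example: for a "sentence" whose text is "== Description ==" the loop matches
# two '=' on each side, so is_header returns ["Description", 2]; for ordinary
# prose the loop never starts and the function falls through, returning None.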
def wiki_parse(sentences):
# TODO: doesn't work with section nesting!!
# 1. replace wikitext header with html header
# 2. add the opacity to each element
# 3. compute an artificial score for each header: the average score of its section
new_sentences = []
print('--- HEADERS ---')
for i in range(len(sentences)):
s = sentences[i]
# if the sentence is a header
header = is_header(s)
if header:
print(header[0])
# start computing the average score of this section
current_total = 0
current_count = 0
next_header_found = False
j = i + 1
# iterate until we find the next header at the same or a higher level
while j < len(sentences) and not next_header_found:
s2 = sentences[j]
s2_header = is_header(s2)
if s2_header:
print(' ' + s2_header[0])
if header[1] >= s2_header[1]:
# encountered a header at the same or a higher level: this section ends here
next_header_found = True
print('X ' + s2_header[0])
else:
# adding every sentence to the average
current_total += s2.score
current_count += 1
j += 1
if current_count != 0:
s.score = current_total / current_count
else:
s.score = "NaN"
s.html = '<h'+str(header[1])+' style="opacity:'+str(s.score)+';">'+header[0]+'</h'+str(header[1])+'>'
# stop once the References or See also section is reached
if header[0] == "References" or header[0] == "See also":
break
new_sentences.append(s)
# not a header
else:
s.html = '<span style="opacity:'+str(s.score)+';">'+s.text+'</span>'
new_sentences.append(s)
return new_sentences
# textrank
# ------------------------------------------------------------------------
def txt2rankedsentences(txt):
# from txt to ranked sentences
return summarize(txt, split=True)
# main
# ------------------------------------------------------------------------
if __name__ == '__main__':
# --- WIKI REQUEST ---
# get text from wikipedia
print('--- WIKI ---')
page = wikipage(wikipedia_page)
if not page:
sys.exit("--- STOP ---")
title = '<h1>'+page.title+'</h1>'
text = page.content
# print text in terminal
print('--- TXT ---')
print(text)
# --- APPLY TEXTRANK ---
# apply textrank
sentences = txt2rankedsentences(text)
# print ranked sentences in terminal
print('--- SENTENCES ---')
for s in sentences:
print('[{score}] : {sentence}'.format(score = s.score, sentence = s.text))
# --- REMAP AND COMPRESS ---
# sorted version of the list
sorted_sentences = sorted(sentences, key=lambda s: s.score, reverse=True)
# remap sentences from 0 to 1
max_score = sorted_sentences[0].score
min_score = sorted_sentences[-1].score
sentences = [remap_score(s, min_score, max_score) for s in sentences]
# compress scores (make more stuff invisible)
sentences = [compress_score(s) for s in sentences]
# -- PARSE ---
# parse every sentence into either a span or a header
sentences = wiki_parse(sentences)
# add back page title
sentences = [{ 'html': title, 'text': page.title, 'score': 1 }] + sentences
# -- TEMPLATING ---
# getting the template
with open(TEMPLATE_PATH, 'r') as file:
template = Template(file.read())
# render template
html = template.render(sentences = sentences)
with open(HTML_PATH, 'w') as file:
file.write(html)
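# The render step assumes template.html loops over `sentences` and emits each
# prebuilt `html` fragment; a minimal sketch of such a template (not
# necessarily the one shipped in this repo) would be:
#
#   <html><body>
#     {% for s in sentences %}{{ s.html }}{% endfor %}
#   </body></html>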

@@ -0,0 +1,2 @@
from summa import commons, graph, keywords, pagerank_weighted, \
summarizer, syntactic_unit, textrank

@@ -0,0 +1,15 @@
from .graph import Graph
def build_graph(sequence):
graph = Graph()
for item in sequence:
if not graph.has_node(item):
graph.add_node(item)
return graph
def remove_unreachable_nodes(graph):
for node in graph.nodes():
if sum(graph.edge_weight((node, other)) for other in graph.neighbors(node)) == 0:
graph.del_node(node)
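# Usage sketch (not part of this module): nodes that end up without any
# weighted edge are dropped, so PageRank only runs on the connected
# co-occurrence graph.
from summa.commons import build_graph, remove_unreachable_nodes

g = build_graph(["cat", "dog", "fish"])
g.add_edge(("cat", "dog"), wt=1)
remove_unreachable_nodes(g)
print(g.nodes())  # ['cat', 'dog'] -- 'fish' had no weighted edge left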

@@ -0,0 +1,2 @@
class TextrankRuntimeError(RuntimeError):
pass

@@ -0,0 +1,244 @@
from abc import ABCMeta, abstractmethod
class IGraph(metaclass=ABCMeta):
"""
Represents the interface or contract that the graph for TextRank should implement
"""
@abstractmethod
def nodes(self):
"""
Return node list.
@rtype: list
@return: Node list.
"""
pass
@abstractmethod
def edges(self):
"""
Return all edges in the graph.
@rtype: list
@return: List of all edges in the graph.
"""
pass
@abstractmethod
def neighbors(self, node):
"""
Return all nodes that are directly accessible from given node.
@type node: node
@param node: Node identifier
@rtype: list
@return: List of nodes directly accessible from given node.
"""
pass
@abstractmethod
def has_node(self, node):
"""
Return whether the requested node exists.
@type node: node
@param node: Node identifier
@rtype: boolean
@return: Truth-value for node existence.
"""
pass
@abstractmethod
def add_node(self, node, attrs=None):
"""
Add given node to the graph.
@attention: While nodes can be of any type, it's strongly recommended to use only
numbers and single-line strings as node identifiers if you intend to use write().
@type node: node
@param node: Node identifier.
@type attrs: list
@param attrs: List of node attributes specified as (attribute, value) tuples.
"""
pass
@abstractmethod
def add_edge(self, edge, wt=1, label='', attrs=[]):
"""
Add an edge to the graph connecting two nodes.
An edge, here, is a pair of nodes like C{(n, m)}.
@type edge: tuple
@param edge: Edge.
@type wt: number
@param wt: Edge weight.
@type label: string
@param label: Edge label.
@type attrs: list
@param attrs: List of node attributes specified as (attribute, value) tuples.
"""
pass
@abstractmethod
def has_edge(self, edge):
"""
Return whether an edge exists.
@type edge: tuple
@param edge: Edge.
@rtype: boolean
@return: Truth-value for edge existence.
"""
pass
@abstractmethod
def edge_weight(self, edge):
"""
Get the weight of an edge.
@type edge: edge
@param edge: One edge.
@rtype: number
@return: Edge weight.
"""
pass
@abstractmethod
def del_node(self, node):
"""
Remove a node from the graph.
@type node: node
@param node: Node identifier.
"""
pass
class Graph(IGraph):
"""
Implementation of an undirected graph, based on Pygraph
"""
WEIGHT_ATTRIBUTE_NAME = "weight"
DEFAULT_WEIGHT = 0
LABEL_ATTRIBUTE_NAME = "label"
DEFAULT_LABEL = ""
def __init__(self):
# Metadata about edges
self.edge_properties = {} # Mapping: Edge -> Dict mapping, label -> str, wt -> num
self.edge_attr = {} # Key value pairs: (Edge -> Attributes)
# Metadata about nodes
self.node_attr = {} # Pairing: Node -> Attributes
self.node_neighbors = {} # Pairing: Node -> Neighbors
def has_edge(self, edge):
u,v = edge
return (u,v) in self.edge_properties and (v,u) in self.edge_properties
def edge_weight(self, edge):
return self.get_edge_properties( edge ).setdefault( self.WEIGHT_ATTRIBUTE_NAME, self.DEFAULT_WEIGHT )
def neighbors(self, node):
return self.node_neighbors[node]
def has_node(self, node):
return node in self.node_neighbors
def add_edge(self, edge, wt=1, label='', attrs=[]):
u, v = edge
if (v not in self.node_neighbors[u] and u not in self.node_neighbors[v]):
self.node_neighbors[u].append(v)
if (u != v):
self.node_neighbors[v].append(u)
self.add_edge_attributes((u,v), attrs)
self.set_edge_properties((u, v), label=label, weight=wt)
else:
raise ValueError("Edge (%s, %s) already in graph" % (u, v))
def add_node(self, node, attrs=None):
if attrs is None:
attrs = []
if (not node in self.node_neighbors):
self.node_neighbors[node] = []
self.node_attr[node] = attrs
else:
raise ValueError("Node %s already in graph" % node)
def nodes(self):
return list(self.node_neighbors.keys())
def edges(self):
return [ a for a in list(self.edge_properties.keys()) ]
def del_node(self, node):
for each in list(self.neighbors(node)):
if (each != node):
self.del_edge((each, node))
del(self.node_neighbors[node])
del(self.node_attr[node])
# Helper methods
def get_edge_properties(self, edge):
return self.edge_properties.setdefault( edge, {} )
def add_edge_attributes(self, edge, attrs):
for attr in attrs:
self.add_edge_attribute(edge, attr)
def add_edge_attribute(self, edge, attr):
self.edge_attr[edge] = self.edge_attributes(edge) + [attr]
if (edge[0] != edge[1]):
self.edge_attr[(edge[1],edge[0])] = self.edge_attributes((edge[1], edge[0])) + [attr]
def edge_attributes(self, edge):
try:
return self.edge_attr[edge]
except KeyError:
return []
def set_edge_properties(self, edge, **properties ):
self.edge_properties.setdefault( edge, {} ).update( properties )
if (edge[0] != edge[1]):
self.edge_properties.setdefault((edge[1], edge[0]), {}).update( properties )
def del_edge(self, edge):
u, v = edge
self.node_neighbors[u].remove(v)
self.del_edge_labeling((u, v))
if (u != v):
self.node_neighbors[v].remove(u)
self.del_edge_labeling((v, u)) # TODO: This is redundant
def del_edge_labeling( self, edge ):
keys = [edge]
keys.append(edge[::-1])
for key in keys:
for mapping in [self.edge_properties, self.edge_attr ]:
try:
del ( mapping[key] )
except KeyError:
pass
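# Usage sketch (not part of this module): edges are stored symmetrically, so
# weight and existence can be queried in either direction.
from summa.graph import Graph

g = Graph()
g.add_node("summary")
g.add_node("sentence")
g.add_edge(("summary", "sentence"), wt=2)
print(g.has_edge(("sentence", "summary")))     # True
print(g.edge_weight(("sentence", "summary")))  # 2
print(g.neighbors("summary"))                  # ['sentence']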

@@ -0,0 +1,227 @@
from itertools import combinations as _combinations
from queue import Queue
from .pagerank_weighted import pagerank_weighted_scipy as _pagerank
from .preprocessing.textcleaner import clean_text_by_word as _clean_text_by_word
from .preprocessing.textcleaner import tokenize_by_word as _tokenize_by_word
from .commons import build_graph as _build_graph
from .commons import remove_unreachable_nodes as _remove_unreachable_nodes
WINDOW_SIZE = 2
"""Check tags in http://www.clips.ua.ac.be/pages/mbsp-tags and use only first two letters
Example: filter for nouns and adjectives:
INCLUDING_FILTER = ['NN', 'JJ']"""
INCLUDING_FILTER = ['NN', 'JJ']
EXCLUDING_FILTER = []
def _get_pos_filters():
return frozenset(INCLUDING_FILTER), frozenset(EXCLUDING_FILTER)
def _get_words_for_graph(tokens):
include_filters, exclude_filters = _get_pos_filters()
if include_filters and exclude_filters:
raise ValueError("Can't use both include and exclude filters, should use only one")
result = []
for word, unit in tokens.items():
if exclude_filters and unit.tag in exclude_filters:
continue
if (include_filters and unit.tag in include_filters) or not include_filters or not unit.tag:
result.append(unit.token)
return result
def _get_first_window(split_text):
return split_text[:WINDOW_SIZE]
def _set_graph_edge(graph, tokens, word_a, word_b):
if word_a in tokens and word_b in tokens:
lemma_a = tokens[word_a].token
lemma_b = tokens[word_b].token
edge = (lemma_a, lemma_b)
if graph.has_node(lemma_a) and graph.has_node(lemma_b) and not graph.has_edge(edge):
graph.add_edge(edge)
def _process_first_window(graph, tokens, split_text):
first_window = _get_first_window(split_text)
for word_a, word_b in _combinations(first_window, 2):
_set_graph_edge(graph, tokens, word_a, word_b)
def _init_queue(split_text):
queue = Queue()
first_window = _get_first_window(split_text)
for word in first_window[1:]:
queue.put(word)
return queue
def _process_word(graph, tokens, queue, word):
for word_to_compare in _queue_iterator(queue):
_set_graph_edge(graph, tokens, word, word_to_compare)
def _update_queue(queue, word):
queue.get()
queue.put(word)
assert queue.qsize() == (WINDOW_SIZE - 1)
def _process_text(graph, tokens, split_text):
queue = _init_queue(split_text)
for i in range(WINDOW_SIZE, len(split_text)):
word = split_text[i]
_process_word(graph, tokens, queue, word)
_update_queue(queue, word)
def _queue_iterator(queue):
iterations = queue.qsize()
for i in range(iterations):
var = queue.get()
yield var
queue.put(var)
def _set_graph_edges(graph, tokens, split_text):
_process_first_window(graph, tokens, split_text)
_process_text(graph, tokens, split_text)
def _extract_tokens(lemmas, scores, ratio, words):
lemmas.sort(key=lambda s: scores[s], reverse=True)
# If no "words" option is selected, the number of sentences is
# reduced by the provided ratio, else, the ratio is ignored.
length = len(lemmas) * ratio if words is None else words
return [(scores[lemmas[i]], lemmas[i],) for i in range(int(length))]
def _lemmas_to_words(tokens):
lemma_to_word = {}
for word, unit in tokens.items():
lemma = unit.token
if lemma in lemma_to_word:
lemma_to_word[lemma].append(word)
else:
lemma_to_word[lemma] = [word]
return lemma_to_word
def _get_keywords_with_score(extracted_lemmas, lemma_to_word):
"""
:param extracted_lemmas:list of tuples
:param lemma_to_word: dict of {lemma:list of words}
:return: dict of {keyword:score}
"""
keywords = {}
for score, lemma in extracted_lemmas:
keyword_list = lemma_to_word[lemma]
for keyword in keyword_list:
keywords[keyword] = score
return keywords
def _strip_word(word):
stripped_word_list = list(_tokenize_by_word(word))
return stripped_word_list[0] if stripped_word_list else ""
def _get_combined_keywords(_keywords, split_text):
"""
:param keywords:dict of keywords:scores
:param split_text: list of strings
:return: combined_keywords:list
"""
result = []
_keywords = _keywords.copy()
len_text = len(split_text)
for i in range(len_text):
word = _strip_word(split_text[i])
if word in _keywords:
combined_word = [word]
if i + 1 == len_text:
result.append(word) # appends last word if keyword and doesn't iterate
for j in range(i + 1, len_text):
other_word = _strip_word(split_text[j])
if other_word in _keywords and other_word == split_text[j] \
and other_word not in combined_word:
combined_word.append(other_word)
else:
for keyword in combined_word:
_keywords.pop(keyword)
result.append(" ".join(combined_word))
break
return result
def _get_average_score(concept, _keywords):
word_list = concept.split()
word_counter = 0
total = 0
for word in word_list:
total += _keywords[word]
word_counter += 1
return total / word_counter
def _format_results(_keywords, combined_keywords, split, scores):
"""
:param keywords:dict of keywords:scores
:param combined_keywords:list of word/s
"""
combined_keywords.sort(key=lambda w: _get_average_score(w, _keywords), reverse=True)
if scores:
return [(word, _get_average_score(word, _keywords)) for word in combined_keywords]
if split:
return combined_keywords
return "\n".join(combined_keywords)
def keywords(text, ratio=0.2, words=None, language="english", split=False, scores=False, deaccent=False, additional_stopwords=None):
if not isinstance(text, str):
raise ValueError("Text parameter must be a Unicode object (str)!")
# Gets a dict of word -> lemma
tokens = _clean_text_by_word(text, language, deacc=deaccent, additional_stopwords=additional_stopwords)
split_text = list(_tokenize_by_word(text))
# Creates the graph and adds the edges
graph = _build_graph(_get_words_for_graph(tokens))
_set_graph_edges(graph, tokens, split_text)
del split_text # It's no longer used
_remove_unreachable_nodes(graph)
# PageRank cannot be run in an empty graph.
if len(graph.nodes()) == 0:
return [] if split else ""
# Ranks the tokens using the PageRank algorithm. Returns dict of lemma -> score
pagerank_scores = _pagerank(graph)
extracted_lemmas = _extract_tokens(graph.nodes(), pagerank_scores, ratio, words)
lemmas_to_word = _lemmas_to_words(tokens)
keywords = _get_keywords_with_score(extracted_lemmas, lemmas_to_word)
# text.split() to keep numbers and punctuation marks, so separated concepts are not combined
combined_keywords = _get_combined_keywords(keywords, text.split())
return _format_results(keywords, combined_keywords, split, scores)
def get_graph(text, language="english", deaccent=False):
tokens = _clean_text_by_word(text, language, deacc=deaccent)
split_text = list(_tokenize_by_word(text, deacc=deaccent))
graph = _build_graph(_get_words_for_graph(tokens))
_set_graph_edges(graph, tokens, split_text)
return graph
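# Usage sketch (not part of this module): the keyword extractor shares the
# same graph/PageRank machinery as the sentence summarizer, so it is a quick
# way to inspect which terms drive the scores.
from summa.keywords import keywords

text = ("Elderflower cordial is a soft drink made largely from a refined "
        "sugar and water solution and uses the flowers of the elderberry.")
print(keywords(text, scores=True))  # (keyword, score) pairs, highest first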

@@ -0,0 +1,86 @@
from scipy.sparse import csr_matrix
from scipy.linalg import eig
from numpy import empty as empty_matrix
CONVERGENCE_THRESHOLD = 0.0001
def pagerank_weighted(graph, initial_value=None, damping=0.85):
"""Calculates PageRank for an undirected graph"""
if initial_value == None: initial_value = 1.0 / len(graph.nodes())
scores = dict.fromkeys(graph.nodes(), initial_value)
iteration_quantity = 0
for iteration_number in range(100):
iteration_quantity += 1
convergence_achieved = 0
for i in graph.nodes():
rank = 1 - damping
for j in graph.neighbors(i):
neighbors_sum = sum(graph.edge_weight((j, k)) for k in graph.neighbors(j))
rank += damping * scores[j] * graph.edge_weight((j, i)) / neighbors_sum
if abs(scores[i] - rank) <= CONVERGENCE_THRESHOLD:
convergence_achieved += 1
scores[i] = rank
if convergence_achieved == len(graph.nodes()):
break
return scores
def pagerank_weighted_scipy(graph, damping=0.85):
adjacency_matrix = build_adjacency_matrix(graph)
probability_matrix = build_probability_matrix(graph)
# Suppress deprecation warnings from numpy.
# See https://github.com/summanlp/textrank/issues/57
import warnings
with warnings.catch_warnings():
from numpy import VisibleDeprecationWarning
warnings.filterwarnings("ignore", category=VisibleDeprecationWarning)
warnings.filterwarnings("ignore", category=PendingDeprecationWarning)
pagerank_matrix = damping * adjacency_matrix.todense() + (1 - damping) * probability_matrix
vals, vecs = eig(pagerank_matrix, left=True, right=False)
return process_results(graph, vecs)
def build_adjacency_matrix(graph):
row = []
col = []
data = []
nodes = graph.nodes()
length = len(nodes)
for i in range(length):
current_node = nodes[i]
neighbors_sum = sum(graph.edge_weight((current_node, neighbor)) for neighbor in graph.neighbors(current_node))
for j in range(length):
edge_weight = float(graph.edge_weight((current_node, nodes[j])))
if i != j and edge_weight != 0:
row.append(i)
col.append(j)
data.append(edge_weight / neighbors_sum)
return csr_matrix((data,(row,col)), shape=(length,length))
def build_probability_matrix(graph):
dimension = len(graph.nodes())
matrix = empty_matrix((dimension,dimension))
probability = 1 / float(dimension)
matrix.fill(probability)
return matrix
def process_results(graph, vecs):
scores = {}
for i, node in enumerate(graph.nodes()):
scores[node] = abs(vecs[i][0])
return scores
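# Usage sketch (not part of this module): the pure-Python variant can be run
# on a tiny weighted graph to see the kind of scores it produces.
from summa.commons import build_graph
from summa.pagerank_weighted import pagerank_weighted

g = build_graph(["a", "b", "c"])
g.add_edge(("a", "b"), wt=1)
g.add_edge(("b", "c"), wt=2)
scores = pagerank_weighted(g)
print(sorted(scores, key=scores.get, reverse=True))  # 'b' is ranked first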

@@ -0,0 +1,635 @@
# Adapted from the NLTK package v3.0.1:
# https://github.com/nltk/nltk/blob/3.0.1/nltk/stem/porter.py
# Copyright (c) 2002 Vivake Gupta (vivakeATomniscia.org). All rights reserved.
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License as
# published by the Free Software Foundation; either version 2 of the
# License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
# USA
#
# This software is maintained by Vivake (vivakeATomniscia.org) and is available at:
# http://www.omniscia.org/~vivake/python/PorterStemmer.py
#
# Additional modifications were made to incorporate this module into
# NLTK. All such modifications are marked with "--NLTK--". The NLTK
# version of this module is maintained by NLTK developers,
# and is available via http://nltk.org/
#
# GNU Linking Exception:
# Using this module statically or dynamically with other modules is
# making a combined work based on this module. Thus, the terms and
# conditions of the GNU General Public License cover the whole combination.
# As a special exception, the copyright holders of this module give
# you permission to combine this module with independent modules to
# produce an executable program, regardless of the license terms of these
# independent modules, and to copy and distribute the resulting
# program under terms of your choice, provided that you also meet,
# for each linked independent module, the terms and conditions of
# the license of that module. An independent module is a module which
# is not derived from or based on this module. If you modify this module,
# you may extend this exception to your version of the module, but you
# are not obliged to do so. If you do not wish to do so, delete this
# exception statement from your version.
"""
Porter Stemmer
This is the Porter stemming algorithm, ported to Python from the
version coded up in ANSI C by the author. It follows the algorithm
presented in
Porter, M. "An algorithm for suffix stripping." Program 14.3 (1980): 130-137.
only differing from it at the points marked --DEPARTURE-- and --NEW--
below.
For a more faithful version of the Porter algorithm, see
http://www.tartarus.org/~martin/PorterStemmer/
Later additions:
June 2000
The 'l' of the 'logi' -> 'log' rule is put with the stem, so that
short stems like 'geo' 'theo' etc work like 'archaeo' 'philo' etc.
This follows a suggestion of Barry Wilkins, research student at
Birmingham.
February 2000
the cvc test for not dropping final -e now looks after vc at the
beginning of a word, so are, eve, ice, ore, use keep final -e. In this
test c is any consonant, including w, x and y. This extension was
suggested by Chris Emerson.
-fully -> -ful treated like -fulness -> -ful, and
-tionally -> -tion treated like -tional -> -tion
both in Step 2. These were suggested by Hiranmay Ghosh, of New Delhi.
Invariants proceed, succeed, exceed. Also suggested by Hiranmay Ghosh.
Additional modifications were made to incorporate this module into
nltk. All such modifications are marked with \"--NLTK--\". The nltk
version of this module is maintained by the NLTK developers, and is
available from <http://nltk.sourceforge.net>
"""
## --NLTK--
## Declare this module's documentation format.
class PorterStemmer():
## --NLTK--
## Add a module docstring
"""
A word stemmer based on the Porter stemming algorithm.
Porter, M. \"An algorithm for suffix stripping.\"
Program 14.3 (1980): 130-137.
A few minor modifications have been made to Porter's basic
algorithm. See the source code of this module for more
information.
The Porter Stemmer requires that all tokens have string types.
"""
# The main part of the stemming algorithm starts here.
# Note that only lower case sequences are stemmed. Forcing to lower case
# should be done before stem(...) is called.
def __init__(self):
## --NEW--
## This is a table of irregular forms. It is quite short, but still
## reflects the errors actually drawn to Martin Porter's attention over
## a 20 year period!
##
## Extend it as necessary.
##
## The form of the table is:
## {
## "p1" : ["s11","s12","s13", ... ],
## "p2" : ["s21","s22","s23", ... ],
## ...
## "pn" : ["sn1","sn2","sn3", ... ]
## }
##
## String sij is mapped to paradigm form pi, and the main stemming
## process is then bypassed.
irregular_forms = {
"sky" : ["sky", "skies"],
"die" : ["dying"],
"lie" : ["lying"],
"tie" : ["tying"],
"news" : ["news"],
"inning" : ["innings", "inning"],
"outing" : ["outings", "outing"],
"canning" : ["cannings", "canning"],
"howe" : ["howe"],
# --NEW--
"proceed" : ["proceed"],
"exceed" : ["exceed"],
"succeed" : ["succeed"], # Hiranmay Ghosh
}
self.pool = {}
for key in irregular_forms:
for val in irregular_forms[key]:
self.pool[val] = key
self.vowels = frozenset(['a', 'e', 'i', 'o', 'u'])
def _cons(self, word, i):
"""cons(i) is TRUE <=> b[i] is a consonant."""
if word[i] in self.vowels:
return False
if word[i] == 'y':
if i == 0:
return True
else:
return (not self._cons(word, i - 1))
return True
def _m(self, word, j):
"""m() measures the number of consonant sequences between k0 and j.
if c is a consonant sequence and v a vowel sequence, and <..>
indicates arbitrary presence,
<c><v> gives 0
<c>vc<v> gives 1
<c>vcvc<v> gives 2
<c>vcvcvc<v> gives 3
....
"""
n = 0
i = 0
while True:
if i > j:
return n
if not self._cons(word, i):
break
i = i + 1
i = i + 1
while True:
while True:
if i > j:
return n
if self._cons(word, i):
break
i = i + 1
i = i + 1
n = n + 1
while True:
if i > j:
return n
if not self._cons(word, i):
break
i = i + 1
i = i + 1
def _vowelinstem(self, stem):
"""vowelinstem(stem) is TRUE <=> stem contains a vowel"""
for i in range(len(stem)):
if not self._cons(stem, i):
return True
return False
def _doublec(self, word):
"""doublec(word) is TRUE <=> word ends with a double consonant"""
if len(word) < 2:
return False
if (word[-1] != word[-2]):
return False
return self._cons(word, len(word)-1)
def _cvc(self, word, i):
"""cvc(i) is TRUE <=>
a) ( --NEW--) i == 1, and word[0] word[1] is vowel consonant, or
b) word[i - 2], word[i - 1], word[i] has the form consonant -
vowel - consonant and also if the second c is not w, x or y. this
is used when trying to restore an e at the end of a short word.
e.g.
cav(e), lov(e), hop(e), crim(e), but
snow, box, tray.
"""
if i == 0: return False # i == 0 never happens perhaps
if i == 1: return (not self._cons(word, 0) and self._cons(word, 1))
if not self._cons(word, i) or self._cons(word, i-1) or not self._cons(word, i-2): return False
ch = word[i]
if ch == 'w' or ch == 'x' or ch == 'y':
return False
return True
def _step1ab(self, word):
"""step1ab() gets rid of plurals and -ed or -ing. e.g.
caresses -> caress
ponies -> poni
sties -> sti
tie -> tie (--NEW--: see below)
caress -> caress
cats -> cat
feed -> feed
agreed -> agree
disabled -> disable
matting -> mat
mating -> mate
meeting -> meet
milling -> mill
messing -> mess
meetings -> meet
"""
if word[-1] == 's':
if word.endswith("sses"):
word = word[:-2]
elif word.endswith("ies"):
if len(word) == 4:
word = word[:-1]
# this line extends the original algorithm, so that
# 'flies'->'fli' but 'dies'->'die' etc
else:
word = word[:-2]
elif word[-2] != 's':
word = word[:-1]
ed_or_ing_trimmed = False
if word.endswith("ied"):
if len(word) == 4:
word = word[:-1]
else:
word = word[:-2]
# this line extends the original algorithm, so that
# 'spied'->'spi' but 'died'->'die' etc
elif word.endswith("eed"):
if self._m(word, len(word)-4) > 0:
word = word[:-1]
elif word.endswith("ed") and self._vowelinstem(word[:-2]):
word = word[:-2]
ed_or_ing_trimmed = True
elif word.endswith("ing") and self._vowelinstem(word[:-3]):
word = word[:-3]
ed_or_ing_trimmed = True
if ed_or_ing_trimmed:
if word.endswith("at") or word.endswith("bl") or word.endswith("iz"):
word += 'e'
elif self._doublec(word):
if word[-1] not in ['l', 's', 'z']:
word = word[:-1]
elif (self._m(word, len(word)-1) == 1 and self._cvc(word, len(word)-1)):
word += 'e'
return word
def _step1c(self, word):
"""step1c() turns terminal y to i when there is another vowel in the stem.
--NEW--: This has been modified from the original Porter algorithm so that y->i
is only done when y is preceded by a consonant, but not if the stem
is only a single consonant, i.e.
(*c and not c) Y -> I
So 'happy' -> 'happi', but
'enjoy' -> 'enjoy' etc
This is a much better rule. Formerly 'enjoy'->'enjoi' and 'enjoyment'->
'enjoy'. Step 1c is perhaps done too soon; but with this modification that
no longer really matters.
Also, the removal of the vowelinstem(z) condition means that 'spy', 'fly',
'try' ... stem to 'spi', 'fli', 'tri' and conflate with 'spied', 'tried',
'flies' ...
"""
if word[-1] == 'y' and len(word) > 2 and self._cons(word, len(word) - 2):
return word[:-1] + 'i'
else:
return word
def _step2(self, word):
"""step2() maps double suffices to single ones.
so -ization ( = -ize plus -ation) maps to -ize etc. note that the
string before the suffix must give m() > 0.
"""
if len(word) <= 1: # Only possible at this stage given unusual inputs to stem_word like 'oed'
return word
ch = word[-2]
if ch == 'a':
if word.endswith("ational"):
return word[:-7] + "ate" if self._m(word, len(word)-8) > 0 else word
elif word.endswith("tional"):
return word[:-2] if self._m(word, len(word)-7) > 0 else word
else:
return word
elif ch == 'c':
if word.endswith("enci"):
return word[:-4] + "ence" if self._m(word, len(word)-5) > 0 else word
elif word.endswith("anci"):
return word[:-4] + "ance" if self._m(word, len(word)-5) > 0 else word
else:
return word
elif ch == 'e':
if word.endswith("izer"):
return word[:-1] if self._m(word, len(word)-5) > 0 else word
else:
return word
elif ch == 'l':
if word.endswith("bli"):
return word[:-3] + "ble" if self._m(word, len(word)-4) > 0 else word # --DEPARTURE--
# To match the published algorithm, replace "bli" with "abli" and "ble" with "able"
elif word.endswith("alli"):
# --NEW--
if self._m(word, len(word)-5) > 0:
word = word[:-2]
return self._step2(word)
else:
return word
elif word.endswith("fulli"):
return word[:-2] if self._m(word, len(word)-6) else word # --NEW--
elif word.endswith("entli"):
return word[:-2] if self._m(word, len(word)-6) else word
elif word.endswith("eli"):
return word[:-2] if self._m(word, len(word)-4) else word
elif word.endswith("ousli"):
return word[:-2] if self._m(word, len(word)-6) else word
else:
return word
elif ch == 'o':
if word.endswith("ization"):
return word[:-7] + "ize" if self._m(word, len(word)-8) else word
elif word.endswith("ation"):
return word[:-5] + "ate" if self._m(word, len(word)-6) else word
elif word.endswith("ator"):
return word[:-4] + "ate" if self._m(word, len(word)-5) else word
else:
return word
elif ch == 's':
if word.endswith("alism"):
return word[:-3] if self._m(word, len(word)-6) else word
elif word.endswith("ness"):
if word.endswith("iveness"):
return word[:-4] if self._m(word, len(word)-8) else word
elif word.endswith("fulness"):
return word[:-4] if self._m(word, len(word)-8) else word
elif word.endswith("ousness"):
return word[:-4] if self._m(word, len(word)-8) else word
else:
return word
else:
return word
elif ch == 't':
if word.endswith("aliti"):
return word[:-3] if self._m(word, len(word)-6) else word
elif word.endswith("iviti"):
return word[:-5] + "ive" if self._m(word, len(word)-6) else word
elif word.endswith("biliti"):
return word[:-6] + "ble" if self._m(word, len(word)-7) else word
else:
return word
elif ch == 'g': # --DEPARTURE--
if word.endswith("logi"):
return word[:-1] if self._m(word, len(word) - 4) else word # --NEW-- (Barry Wilkins)
# To match the published algorithm, pass len(word)-5 to _m instead of len(word)-4
else:
return word
else:
return word
def _step3(self, word):
"""step3() deals with -ic-, -full, -ness etc. similar strategy to step2."""
ch = word[-1]
if ch == 'e':
if word.endswith("icate"):
return word[:-3] if self._m(word, len(word)-6) else word
elif word.endswith("ative"):
return word[:-5] if self._m(word, len(word)-6) else word
elif word.endswith("alize"):
return word[:-3] if self._m(word, len(word)-6) else word
else:
return word
elif ch == 'i':
if word.endswith("iciti"):
return word[:-3] if self._m(word, len(word)-6) else word
else:
return word
elif ch == 'l':
if word.endswith("ical"):
return word[:-2] if self._m(word, len(word)-5) else word
elif word.endswith("ful"):
return word[:-3] if self._m(word, len(word)-4) else word
else:
return word
elif ch == 's':
if word.endswith("ness"):
return word[:-4] if self._m(word, len(word)-5) else word
else:
return word
else:
return word
def _step4(self, word):
"""step4() takes off -ant, -ence etc., in context <c>vcvc<v>."""
if len(word) <= 1: # Only possible at this stage given unusual inputs to stem_word like 'oed'
return word
ch = word[-2]
if ch == 'a':
if word.endswith("al"):
return word[:-2] if self._m(word, len(word)-3) > 1 else word
else:
return word
elif ch == 'c':
if word.endswith("ance"):
return word[:-4] if self._m(word, len(word)-5) > 1 else word
elif word.endswith("ence"):
return word[:-4] if self._m(word, len(word)-5) > 1 else word
else:
return word
elif ch == 'e':
if word.endswith("er"):
return word[:-2] if self._m(word, len(word)-3) > 1 else word
else:
return word
elif ch == 'i':
if word.endswith("ic"):
return word[:-2] if self._m(word, len(word)-3) > 1 else word
else:
return word
elif ch == 'l':
if word.endswith("able"):
return word[:-4] if self._m(word, len(word)-5) > 1 else word
elif word.endswith("ible"):
return word[:-4] if self._m(word, len(word)-5) > 1 else word
else:
return word
elif ch == 'n':
if word.endswith("ant"):
return word[:-3] if self._m(word, len(word)-4) > 1 else word
elif word.endswith("ement"):
return word[:-5] if self._m(word, len(word)-6) > 1 else word
elif word.endswith("ment"):
return word[:-4] if self._m(word, len(word)-5) > 1 else word
elif word.endswith("ent"):
return word[:-3] if self._m(word, len(word)-4) > 1 else word
else:
return word
elif ch == 'o':
if word.endswith("sion") or word.endswith("tion"): # slightly different logic to all the other cases
return word[:-3] if self._m(word, len(word)-4) > 1 else word
elif word.endswith("ou"):
return word[:-2] if self._m(word, len(word)-3) > 1 else word
else:
return word
elif ch == 's':
if word.endswith("ism"):
return word[:-3] if self._m(word, len(word)-4) > 1 else word
else:
return word
elif ch == 't':
if word.endswith("ate"):
return word[:-3] if self._m(word, len(word)-4) > 1 else word
elif word.endswith("iti"):
return word[:-3] if self._m(word, len(word)-4) > 1 else word
else:
return word
elif ch == 'u':
if word.endswith("ous"):
return word[:-3] if self._m(word, len(word)-4) > 1 else word
else:
return word
elif ch == 'v':
if word.endswith("ive"):
return word[:-3] if self._m(word, len(word)-4) > 1 else word
else:
return word
elif ch == 'z':
if word.endswith("ize"):
return word[:-3] if self._m(word, len(word)-4) > 1 else word
else:
return word
else:
return word
def _step5(self, word):
"""step5() removes a final -e if m() > 1, and changes -ll to -l if
m() > 1.
"""
if word[-1] == 'e':
a = self._m(word, len(word)-1)
if a > 1 or (a == 1 and not self._cvc(word, len(word)-2)):
word = word[:-1]
if word.endswith('ll') and self._m(word, len(word)-1) > 1:
word = word[:-1]
return word
def stem_word(self, p, i=0, j=None):
"""
Returns the stem of p, or, if i and j are given, the stem of p[i:j+1].
"""
## --NLTK--
if j is None and i == 0:
word = p
else:
if j is None:
j = len(p) - 1
word = p[i:j+1]
if word in self.pool:
return self.pool[word]
if len(word) <= 2:
return word # --DEPARTURE--
# With this line, strings of length 1 or 2 don't go through the
# stemming process, although no mention is made of this in the
# published algorithm. Remove the line to match the published
# algorithm.
word = self._step1ab(word)
word = self._step1c(word)
word = self._step2(word)
word = self._step3(word)
word = self._step4(word)
word = self._step5(word)
return word
def _adjust_case(self, word, stem):
lower = word.lower()
ret = ""
for x in range(len(stem)):
if lower[x] == stem[x]:
ret += word[x]
else:
ret += stem[x]
return ret
## --NLTK--
## Don't use this procedure; we want to work with individual
## tokens, instead. (commented out the following procedure)
#def stem(self, text):
# parts = re.split("(\W+)", text)
# numWords = (len(parts) + 1)/2
#
# ret = ""
# for i in xrange(numWords):
# word = parts[2 * i]
# separator = ""
# if ((2 * i) + 1) < len(parts):
# separator = parts[(2 * i) + 1]
#
# stem = self.stem_word(string.lower(word), 0, len(word) - 1)
# ret = ret + self.adjust_case(word, stem)
# ret = ret + separator
# return ret
## --NLTK--
## Define a stem() method that implements the StemmerI interface.
def stem(self, word):
stem = self.stem_word(word.lower(), 0, len(word) - 1)
return self._adjust_case(word, stem)
## --NLTK--
## Add a string representation function
def __repr__(self):
return '<PorterStemmer>'
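# Usage sketch (not part of this module; import path as in upstream summa):
# stemming conflates inflected forms so the keyword graph treats them as one node.
from summa.preprocessing.porter import PorterStemmer

stemmer = PorterStemmer()
print([stemmer.stem(w) for w in ["meetings", "flies", "dying", "caresses"]])
# ['meet', 'fli', 'die', 'caress']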

File diff suppressed because it is too large

@@ -0,0 +1,210 @@
english = """
all six eleven just less being indeed over both anyway detail four front already through yourselves fify
mill still its before move whose one system also somewhere herself thick show had enough should to only
seeming under herein ours two has might thereafter do them his around thereby get very de none cannot
every whether they not during thus now him nor name regarding several hereafter did always cry whither
beforehand this someone she each further become thereupon where side towards few twelve because often ten
anyhow doing km eg some back used go namely besides yet are cant our beyond ourselves sincere out even
what throughout computer give for bottom mine since please while per find everything behind does various
above between kg neither seemed ever across t somehow be we who were sixty however here otherwise whereupon
nowhere although found hers re along quite fifteen by on about didn last would anything via of could thence
put against keep etc s became ltd hence therein onto or whereafter con among own co afterwards formerly
within seems into others whatever yourself down alone everyone done least another whoever moreover couldnt
must your three from her their together top there due been next anyone whom much call too interest thru
themselves hundred was until empty more himself elsewhere mostly that fire becomes becoming hereby but
else part everywhere former don with than those he me forty myself made full twenty these bill using up us
will nevertheless below anywhere nine can theirs toward my something and sometimes whenever sometime then
almost wherever is describe am it doesn an really as itself at have in seem whence ie any if again hasnt
inc un thin no perhaps latter meanwhile when amount same wherein beside how other take which latterly you
fill either nobody unless whereas see though may after upon therefore most hereupon eight amongst never
serious nothing such why a off whereby third i whole noone many well except amoungst yours rather without
so five the first having once
"""
spanish = """
un una unas unos uno sobre todo tambien tras otro algun alguno alguna algunos algunas ser es soy eres somos
sois estoy esta estamos estais estan como en para atras porque por que estado estaba ante antes siendo ambos
pero por poder puede puedo podemos podeis pueden fui fue fuimos fueron hacer hago hace hacemos haceis hacen
cada fin incluso primero desde conseguir consigo consigue consigues conseguimos consiguen ir voy va vamos
vais van vaya gueno ha tener tengo tiene tenemos teneis tienen el la lo las los su aqui mio tuyo ellos ellas
nos nosotros vosotros vosotras si dentro solo solamente saber sabes sabe sabemos sabeis saben ultimo largo
bastante haces muchos aquellos aquellas sus entonces tiempo verdad verdadero verdadera cierto ciertos cierta
ciertas intentar intento intenta intentas intentamos intentais intentan dos bajo arriba encima usar uso usas
usa usamos usais usan emplear empleo empleas emplean ampleamos empleais valor muy era eras eramos eran modo
bien cual cuando donde mientras quien con entre sin trabajo trabajar trabajas trabaja trabajamos trabajais
trabajan podria podrias podriamos podrian podriais yo aquel a acabar actualmente acuerdo adelante ademas
ademas adrede afirmo agrego ahi ahora ahi al algo alguna algunas alguno algunos algun alla alli alli alrededor
ambos antano antano ante anterior antes apenas aproximadamente aquel aquella aquellas aquello aquellos aqui
aquel aquella aquellas aquellos aqui arribaabajo aseguro asi asi aun aunque ayer anadio aun b bajo bastante
bien breve buen buena buenas bueno buenos c cada casi cerca cierto cinco claro comento como con conmigo
conocer considera considero contigo contra cosa cosas creo cual cuales cualquier cuando cuanta cuantas cuanto
cuantos cuatro cuenta cuyo cual cuales cuando cuanta cuantas cuanto cuantos como d da dado dan dar de debajo
debe deben deber debido decir dejo del delante demasiado demas dentro deprisa desde despacio despues despues
detras detras dia dias dice dicen dicho dieron diferente diferentes dijeron dijo dio donde dos durante dia
dias donde e ejemplo el ella ellas ello ellos embargo en encima encuentra enfrente enseguida entonces entre
era erais eramos eran eras eres es esa esas ese eso esos esta estaba estabais estabamos estaban estabas estad
estada estadas estado estados estais estamos estan estando estar estara estaran estaras estare estareis
estaremos estaria estariais estariamos estarian estarias estara estas este esteis estemos esten estes esto
estos estoy estuve estuviera estuvierais estuvieramos estuvieran estuvieras estuvieron estuviese estuvieseis
estuviesemos estuviesen estuvieses estuvimos estuviste estuvisteis estuvo esta estan ex excepto existe existen
explico expreso f fin final fue fuera fuerais fueramos fueran fueras fueron fuese fueseis fuesemos fuesen
fueses fui fuimos fuiste fuisteis g general gran grande grandes gustar h ha habeis haber habia habiais habiamos
habian habias habida habidas habido habidos habiendo habla hablan habra habran habras habre habreis habremos
habria habriais habriamos habrian habrias habra habia habian hace hacen hacer hacerlo hacia haciendo han has
hasta hay haya hayais hayamos hayan hayas he hecho hemos hicieron hizo horas hoy hube hubiera hubierais
hubieramos hubieran hubieras hubieron hubiese hubieseis hubiesemos hubiesen hubieses hubimos hubiste hubisteis
hubo i igual incluso indico informo informo ir j jamas junto k l la lado las le lejos les llego lleva llevar
lo los luego lugar m mal manera manifesto mas mayor me mediante medio mejor menciono menos menudo mi mia mias
mientras mio mios mis misma mismas mismo mismos momento mucha muchas mucho muchos muy mas mi mia mias mio mios
n nada nadie ni ningun ninguna ningunas ninguno ningunos ningun no nos nosotras nosotros nuestra nuestras
nuestro nuestros nueva nuevas nuevo nuevos nunca o ocho os otra otras otro otros p pais para parece parte
partir pasada pasado pasar pais peor pequeno pero pesar poca pocas poco pocos podemos poder podra podran podria
podrian poner por porque posible primer primera primero primeros principalmente pronto propia propias propio
propios proximo proximo proximos pudo pueda puede pueden pues q qeu que quedo queremos querer quien quienes
quiere quiza quizas quiza quizas quien quienes que r raras realizado realizar realizo repente respecto s saber
salvo se sea seais seamos sean seas seguir segun segunda segundo segun seis senor senora ser sera seran seras
sere sereis seremos seria seriais seriamos serian serias sera seran seria senalo si sido siempre siendo siete
sigue siguiente sin sino sisi sobre sois sola solamente solas solo solos somos son soy soyos su supuesto sus
suya suyas suyo suyos se si solo t tal tambien tambien tampoco tan tanto tarde te temprano tendra tendran
tendras tendre tendreis tendremos tendria tendriais tendriamos tendrian tendrias tendra tendran tened teneis
tenemos tener tenga tengais tengamos tengan tengas tengo tenia teniais teniamos tenian tenias tenida tenidas
tenido tenidos teniendo tenia tercera ti tiene tienen tienes toda todas todavia todavia todo todos tomar total
tras trata traves tres tu tus tuve tuviera tuvierais tuvieramos tuvieran tuvieras tuvieron tuviese tuvieseis
tuviesemos tuviesen tuvieses tuvimos tuviste tuvisteis tuvo tuya tuyas tuyo tuyos tu u un una unas uno unos
usted ustedes v va vamos van varias varios veces venir ver vez volver vosotras vosotros vuestra vuestras vuestro
vuestros w x y ya yo z el esa esas ese esos esta estas este estos ultima ultimas ultimo ultimos
"""
german = """
aber als am an auch auf aus bei bin bis bist da dadurch daher darum das daß dass dein deine dem den der des
dessen deshalb die dies dieser dieses doch dort du durch ein eine einem einen einer eines er es euer eure fur
hatte hatten hattest hattet hierhinter ich ihr ihre im in ist ja jede jedem jeden jeder jedes jener jenes jetzt
kann kannst konnen konnt machen mein meine mit muß mußt musst mussen mußt nach nachdem nein nicht nun oder seid
sein seine sich sie sind soll sollen sollst sollt sonst soweit sowie und unserunsere unter vom von vor wann
warum was weiter weitere wenn wer werde werden werdet weshalb wie wieder wieso wir wird wirst wo woher wohin zu
zum zur uber
"""
portuguese = """
de a o que e do da em um para é com não uma os no se na por mais as dos como mas foi ao ele das tem à seu
sua ou ser quando muito nos está eu também pelo pela até isso ela entre era depois sem mesmo aos ter
seus quem nas me esse eles estão você tinha foram essa num nem suas meu às minha têm numa pelos elas havia seja
qual será nós tenho lhe deles essas esses pelas este fosse dele tu te vocês vos lhes meus minhas teu tua teus
tuas nosso nossa nossos nossas dela delas esta estes estas aquele aquela aqueles aquelas isto aquilo estou está
estamos estão estive esteve estivemos estiveram estava estávamos estavam estivera estivéramos esteja estejamos
estejam estivesse estivéssemos estivessem estiver estivermos estiverem hei havemos hão houve houvemos houveram
houvera houvéramos haja hajamos hajam houvesse houvéssemos houvessem houver houvermos houverem houverei houverá
houveremos houverão houveria houveríamos houveriam sou somos são era éramos eram fui foi fomos foram fora fôramos
seja sejamos sejam fosse fôssemos fossem for formos forem serei será seremos serão seria seríamos seriam tenho
tem temos tém tinha tínhamos tinham tive teve tivemos tiveram tivera tivéramos tenha tenhamos tenham tivesse
tivéssemos tivessem tiver tivermos tiverem terei terá teremos terão teria teríamos teriam
"""
swedish = """
aderton adertonde adjö aldrig alla allas allt alltid alltså andra andras annan annat artonde artonn att av bakom
bara behöva behövas behövde behövt beslut beslutat beslutit bland blev bli blir blivit bort borta bra bäst bättre
båda bådas dag dagar dagarna dagen de del delen dem den denna deras dess dessa det detta dig din dina dit ditt
dock dom du där därför e efter eftersom ej elfte eller elva emot en enkel enkelt enkla enligt ens er era ers
ert ett ettusen fanns fem femte femtio femtionde femton femtonde fick fin finnas finns fjorton fjortonde fjärde
fler flera flesta fram framför från fyra fyrtio fyrtionde får fått följande för före förlåt förra första
genast genom gick gjorde gjort god goda godare godast gott gälla gäller gällt gärna går gått gör göra ha hade
haft han hans har heller hellre helst helt henne hennes hit hon honom hundra hundraen hundraett hur här hög höger
högre högst i ibland icke idag igen igår imorgon in inför inga ingen ingenting inget innan inne inom inte inuti
ja jag jo ju just jämfört kan kanske knappast kom komma kommer kommit kr kunde kunna kunnat kvar legat ligga
ligger lika likställd likställda lilla lite liten litet länge längre längst lätt lättare lättast långsam
långsammare långsammast långsamt långt låt man med mej mellan men mer mera mest mig min mina mindre minst mitt
mittemot mot mycket många måste möjlig möjligen möjligt möjligtvis ned nederst nedersta nedre nej ner ni nio
nionde nittio nittionde nitton nittonde nog noll nr nu nummer när nästa någon någonting något några nån nånting
nåt nödvändig nödvändiga nödvändigt nödvändigtvis och också ofta oftast olika olikt om oss rakt redan rätt sa
sade sagt samma sedan senare senast sent sex sextio sextionde sexton sextonde sig sin sina sist sista siste sitt
sitta sju sjunde sjuttio sjuttionde sjutton sjuttonde själv sjätte ska skall skulle slutligen små smått snart som
stor stora stort större störst säga säger sämre sämst sådan sådana sådant ta tack tar tidig tidigare tidigast
tidigt till tills tillsammans tio tionde tjugo tjugoen tjugoett tjugonde tjugotre tjugotvå tjungo tolfte tolv tre
tredje trettio trettionde tretton trettonde två tvåhundra under upp ur ursäkt ut utan utanför ute va vad var vara
varför varifrån varit varje varken vars varsågod vart vem vems verkligen vi vid vidare viktig viktigare viktigast
viktigt vilka vilkas vilken vilket vill väl vänster vänstra värre vår våra vårt än ännu är även åt åtminstone
åtta åttio åttionde åttonde över övermorgon överst övre
"""
danish = """
ad af aldrig alle alt anden andet andre at bare begge blev blive bliver da de dem den denne der deres det dette
dig din dine disse dit dog du efter ej eller en end ene eneste enhver er et far fem fik fire flere fleste for
fordi forrige fra får før god godt ham han hans har havde have hej helt hende hendes her hos hun hvad hvem
hver hvilken hvis hvor hvordan hvorfor hvornår i ikke ind ingen intet ja jeg jer jeres jo kan kom komme kommer
kun kunne lad lav lidt lige lille man mand mange med meget men mens mere mig min mine mit mod ned nej ni nogen
noget nogle nu ny nyt når nær næste næsten og også okay om op os otte over se seks selv ser ses sig sige
sin sine sit skal skulle som stor store syv sådan tag tage thi ti til to tre ud under var ved vi vil ville
vor vores være været alene allerede alligevel altid bag blandt burde bør dens derefter derfor derfra deri dermed
derpå derved egen ellers endnu ens enten flest foran først gennem gjorde gjort gør gøre gørende hel heller hen
henover herefter heri hermed herpå hvilke hvilkes hvorefter hvorfra hvorhen hvori hvorimod hvorved igen igennem
imellem imens imod indtil langs lave lavet ligesom længere mellem mest mindre mindst måske nemlig nogensinde nok
omkring overalt samme sammen selvom senere siden stadig synes syntes således temmelig tidligere tilbage tit uden
udover undtagen via vore vær øvrigt
"""
italian = """
un avete dal voi nostri avesti stiano starò sull tutto faccio sarai vostri farebbe ai degli farò c faccia lo
sullo farà facevate avendo fummo stiamo staranno questi sia con sue al mio fareste ero di e avessi alle avreste
avesse alla avrei avemmo col ad ne avremmo avevano tuo avessero siate suoi facevo ti che mi questa avrebbe fossero
tua starebbero faceste facesti anche cui ho tra foste stavamo non stessi avevate nostre quelli queste avrete eri
facemmo stavate stia in dagl avrò avremo se feci furono io stavano nelle quante per abbiano nell faceva fecero steste
eravamo farei sarei avevi sui quanto dai dello era loro su quello fossi stava nostra quale una farete gli siano avranno
i stette fece negli facciano facevano dove vostra farebbero sugli vostro uno aveva dall ha avuto avuti sarete sulla sarà
perché essendo fai siete facendo da avevamo starà o faranno lei mie stiate nel fu facciamo stessero noi facciate stando
si è avute sarebbero miei sto contro avrà coi chi ci avrebbero aveste stettero abbiamo sarebbe agl del stareste sua faremo
siamo fanno sei abbiate fui ed quella dalle facessero tue fosti facevamo erano stessimo nei facessimo nello le dell abbia
fosse farai facesse starai stavo staremo mia stesse avevo lui agli fossimo dagli vostre stanno sareste quanti stemmo facessi
ebbe stesti tuoi dallo tutti sugl staremmo vi la dei quanta ebbero stavi saranno delle dalla saresti staresti stai suo nostro
aremo starete saremmo sarò li hai allo avresti dov avuta faresti starei il quelle degl all a ebbi nella eravate stetti negl
come questo facevi sulle più tu della sono starebbe sul hanno faremmo sta avrai avessimo ma l
"""
# stopwords from https://github.com/bieli/stopwords repository
polish = """
a aby ach acz aczkolwiek aj albo ale alez ależ ani az bardziej bardzo beda bedzie bez deda będą bede będę
będzie bo bowiem by byc być byl byla byli bylo byly był była było były bynajmniej cala cali caly cała cały ci
cie ciebie cię co cokolwiek cos coś czasami czasem czemu czy czyli daleko dla dlaczego dlatego do dobrze
dokad dokąd dosc dość duzo dużo dwa dwaj dwie dwoje dzis dzisiaj dziś gdy gdyby gdyz gdyż gdzie gdziekolwiek
gdzies gdzieś go i ich ile im inna inne inny innych iz ja jak jakas jakaś jakby jaki jakichs jakichś jakie
jakis jakiś jakiz jakiż jakkolwiek jako jakos jakoś je jeden jedna jednak jednakze jednakże jedno jego jej
jemu jesli jest jestem jeszcze jeśli jezeli jeżeli juz już kazdy każdy kiedy kilka kims kimś kto ktokolwiek
ktora ktore ktorego ktorej ktory ktorych ktorym ktorzy ktos ktoś która które którego której który których
którym którzy ku lat lecz lub ma mają mało mam mi miedzy między mimo mna mną mnie moga mogą moi moim moj
moja moje moze mozliwe mozna może możliwe można mój mu musi my na nad nam nami nas nasi nasz nasza nasze
naszego naszych natomiast natychmiast nawet nia nią nic nich nie niech niego niej niemu nigdy nim nimi niz
niż no o obok od około on ona one oni ono oraz oto owszem pan pana pani po pod podczas pomimo ponad poniewaz
ponieważ powinien powinna powinni powinno poza prawie przeciez przecież przed przede przedtem przez przy roku
rowniez również sam sama sie się skad skąd soba sobą sobie sposob sposób swoje ta tak taka taki takie
takze także tam te tego tej ten teraz też to toba tobą tobie totez toteż totobą trzeba tu tutaj twoi twoim
twoj twoja twoje twój twym ty tych tylko tym u w wam wami was wasz wasza wasze we według wiele wielu więc
więcej wlasnie właśnie wszyscy wszystkich wszystkie wszystkim wszystko wtedy wy z za zaden zadna zadne
zadnych zapewne zawsze ze zeby zeznowu znow znowu znów zostal został żaden żadna żadne żadnych że żeby
"""
arabic = """أنت كليكما اللتان بنا هما إذا اللواتي أينما كلاهما إما كيت إذ هم ليس كيف لك هن لئن ألا عليك وإن إليكما أيها لعل أنتن كأي لسن ممن له
حين اللتين فيها عسى ما هي أين ليسا هنا بما عما هاته ذاك لدى هاك نحو بكم ذواتا هذا أقل اللتيا إن مع لكما بكما قد لي أولئك إليك أن كلا
ليسوا بس ذات فيه منها ومن هو بها كأنما هاهنا هاتان هذي ذلك كما أوه هكذا ذوا ليست لكي نعم لكن خلا لكم أنا بخ تي فلا حبذا أولاء
ذواتي منذ ولو بين لكنما سوى آها تلك إي آي إذما الذي كليهما لكيلا لهما بعض يا بكن حيثما وإذا بهما ذا ها فيما ماذا والذين لستما كل
لوما ثمة متى عند في هيهات أما ذان الذين وهو أنتم كي آه ذي إذن إليكم بل فإن وإذ تلكما هلا فإذا هذه ذلكم فمن إلا إنا بمن كذلك هاتين
عليه كأن هل ذلكما مهما شتان والذي هيا ذين لستن بك مذ ولا هذين كأين فيم حتى إنما بهن هنالك أم لسنا غير لنا منه نحن اللاتي بعد تينك
ذلكن ولكن كلما إيه عدا لها هذان ته حاشا دون أنى عن تين أكثر كلتا إنه بيد كذا هاتي ذو لست لم إليكن وما مما إلى ذانك اللذين من مه أف
كم اللائي حيث ليستا هؤلاء بماذا ليت هيت بهم لهن التي لولا لو لهم هناك ثم سوف كيفما لستم لما ذينك بلى لا تلكم على لاسيما به بي اللذان أي ذه لن عل أو ريث أنتما
"""
LANGUAGES = {
"danish": danish,
"english": english,
"german": german,
"spanish": spanish,
"portuguese": portuguese,
"swedish": swedish,
"italian": italian,
"polish": polish,
"arabic": arabic
}
def get_stopwords_by_language(language):
if language in LANGUAGES:
return LANGUAGES[language]
return ""

@@ -0,0 +1,188 @@
import string
import unicodedata
import logging
logger = logging.getLogger('summa.preprocessing.cleaner')
try:
from pattern.en import tag
logger.info("'pattern' package found; tag filters are available for English")
HAS_PATTERN = True
except ImportError:
logger.info("'pattern' package not found; tag filters are not available for English")
HAS_PATTERN = False
import re
from .snowball import SnowballStemmer
from .stopwords import get_stopwords_by_language
from summa.syntactic_unit import SyntacticUnit
# Utility functions adapted from Gensim v0.10.0:
# https://github.com/RaRe-Technologies/gensim/blob/0.10.0/gensim/utils.py
# https://github.com/RaRe-Technologies/gensim/blob/0.10.0/gensim/parsing/preprocessing.py
SEPARATOR = r"@"
RE_SENTENCE = re.compile(r'(\S.+?[.!?])(?=\s+|$)|(\S.+?)(?=[\n]|$)')
AB_SENIOR = re.compile(r"([A-Z][a-z]{1,2}\.)\s(\w)")
AB_ACRONYM = re.compile(r"(\.[a-zA-Z]\.)\s(\w)")
AB_ACRONYM_LETTERS = re.compile(r"([a-zA-Z])\.([a-zA-Z])\.")
UNDO_AB_SENIOR = re.compile(r"([A-Z][a-z]{1,2}\.)" + SEPARATOR + r"(\w)")
UNDO_AB_ACRONYM = re.compile(r"(\.[a-zA-Z]\.)" + SEPARATOR + r"(\w)")
STEMMER = None
STOPWORDS = None
def set_stemmer_language(language):
global STEMMER
if language not in SnowballStemmer.languages:
raise ValueError("Valid languages are: " + ", ".join(sorted(SnowballStemmer.languages)))
STEMMER = SnowballStemmer(language)
def set_stopwords_by_language(language, additional_stopwords):
global STOPWORDS
words = get_stopwords_by_language(language)
if not additional_stopwords:
additional_stopwords = {}
STOPWORDS = frozenset({ w for w in words.split() if w } | { w for w in additional_stopwords if w })
def init_textcleanner(language, additional_stopwords):
set_stemmer_language(language)
set_stopwords_by_language(language, additional_stopwords)
def split_sentences(text):
processed = replace_abbreviations(text)
return [undo_replacement(sentence) for sentence in get_sentences(processed)]
def replace_abbreviations(text):
return replace_with_separator(text, SEPARATOR, [AB_SENIOR, AB_ACRONYM])
def undo_replacement(sentence):
return replace_with_separator(sentence, r" ", [UNDO_AB_SENIOR, UNDO_AB_ACRONYM])
def replace_with_separator(text, separator, regexs):
replacement = r"\1" + separator + r"\2"
result = text
for regex in regexs:
result = regex.sub(replacement, result)
return result
def get_sentences(text):
for match in RE_SENTENCE.finditer(text):
yield match.group()
# Taken from Gensim
RE_PUNCT = re.compile('([%s])+' % re.escape(string.punctuation), re.UNICODE)
def strip_punctuation(s):
return RE_PUNCT.sub(" ", s)
# Taken from Gensim
RE_NUMERIC = re.compile(r"[0-9]+", re.UNICODE)
def strip_numeric(s):
return RE_NUMERIC.sub("", s)
def remove_stopwords(sentence):
return " ".join(w for w in sentence.split() if w not in STOPWORDS)
def stem_sentence(sentence):
word_stems = [STEMMER.stem(word) for word in sentence.split()]
return " ".join(word_stems)
def apply_filters(sentence, filters):
for f in filters:
sentence = f(sentence)
return sentence
def filter_words(sentences):
filters = [lambda x: x.lower(), strip_numeric, strip_punctuation, remove_stopwords,
stem_sentence]
apply_filters_to_token = lambda token: apply_filters(token, filters)
return list(map(apply_filters_to_token, sentences))
# Taken from Gensim
def deaccent(text):
"""
Remove accentuation from the given string.
"""
norm = unicodedata.normalize("NFD", text)
result = "".join(ch for ch in norm if unicodedata.category(ch) != 'Mn')
return unicodedata.normalize("NFC", result)
# Taken from Gensim
PAT_ALPHABETIC = re.compile(r'(((?![\d])\w)+)', re.UNICODE)
def tokenize(text, lowercase=False, deacc=False):
"""
Iteratively yield tokens as unicode strings, optionally also lowercasing them
and removing accent marks.
"""
if lowercase:
text = text.lower()
if deacc:
text = deaccent(text)
for match in PAT_ALPHABETIC.finditer(text):
yield match.group()
def merge_syntactic_units(original_units, filtered_units, tags=None):
units = []
for i in range(len(original_units)):
if filtered_units[i] == '':
continue
text = original_units[i]
token = filtered_units[i]
tag = tags[i][1] if tags else None
sentence = SyntacticUnit(text, token, tag)
sentence.index = i
units.append(sentence)
return units
def clean_text_by_sentences(text, language="english", additional_stopwords=None):
""" Tokenizes a given text into sentences, applying filters and lemmatizing them.
Returns a SyntacticUnit list. """
init_textcleanner(language, additional_stopwords)
original_sentences = split_sentences(text)
filtered_sentences = filter_words(original_sentences)
return merge_syntactic_units(original_sentences, filtered_sentences)
def clean_text_by_word(text, language="english", deacc=False, additional_stopwords=None):
""" Tokenizes a given text into words, applying filters and lemmatizing them.
Returns a dict of word -> syntacticUnit. """
init_textcleanner(language, additional_stopwords)
text_without_acronyms = replace_with_separator(text, "", [AB_ACRONYM_LETTERS])
original_words = list(tokenize(text_without_acronyms, lowercase=True, deacc=deacc))
filtered_words = filter_words(original_words)
if HAS_PATTERN:
tags = tag(" ".join(original_words)) # tag needs the context of the words in the text
else:
tags = None
units = merge_syntactic_units(original_words, filtered_words, tags)
return { unit.text : unit for unit in units }
def tokenize_by_word(text, deacc=False):
text_without_acronyms = replace_with_separator(text, "", [AB_ACRONYM_LETTERS])
return tokenize(text_without_acronyms, lowercase=True, deacc=deacc)
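
A small, hedged sketch of the sentence pipeline above; the sample text is illustrative, and the import path assumes the vendored layout summa/preprocessing/textcleaner.py implied by the imports at the top of the file.

from summa.preprocessing.textcleaner import clean_text_by_sentences

units = clean_text_by_sentences("Dr. Smith stores goods. The warehouse is large.")
for u in units:
    # u.text is the original sentence; u.token is its lowercased,
    # stopword-free, stemmed form used later for scoring
    print(u.index, repr(u.text), "->", repr(u.token))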

@ -0,0 +1,24 @@
# Natural Language Toolkit: Stemmer Utilities
#
# Copyright (C) 2001-2019 NLTK Project
# Author: Helder <he7d3r@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
def suffix_replace(original, old, new):
"""
Replaces the old suffix of the original string by a new suffix
"""
return original[: -len(old)] + new
def prefix_replace(original, old, new):
"""
Replaces the old prefix of the original string with a new prefix
:param original: string
:param old: string
:param new: string
:return: string
"""
return new + original[len(old) :]
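
Both helpers are simple slicing utilities; a couple of illustrative assertions:

# keep everything before the old suffix, then append the new one
assert suffix_replace("happiness", "ness", "ly") == "happily"
# drop the old prefix, then prepend the new one
assert prefix_replace("undo", "un", "re") == "redo"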

@ -0,0 +1,154 @@
from math import log10
from .pagerank_weighted import pagerank_weighted_scipy as _pagerank
from .preprocessing.textcleaner import clean_text_by_sentences as _clean_text_by_sentences
from .commons import build_graph as _build_graph
from .commons import remove_unreachable_nodes as _remove_unreachable_nodes
def _set_graph_edge_weights(graph):
for sentence_1 in graph.nodes():
for sentence_2 in graph.nodes():
edge = (sentence_1, sentence_2)
if sentence_1 != sentence_2 and not graph.has_edge(edge):
similarity = _get_similarity(sentence_1, sentence_2)
if similarity != 0:
graph.add_edge(edge, similarity)
# Handles the case in which all similarities are zero.
# The resultant summary will consist of random sentences.
if all(graph.edge_weight(edge) == 0 for edge in graph.edges()):
_create_valid_graph(graph)
def _create_valid_graph(graph):
nodes = graph.nodes()
for i in range(len(nodes)):
for j in range(len(nodes)):
if i == j:
continue
edge = (nodes[i], nodes[j])
if graph.has_edge(edge):
graph.del_edge(edge)
graph.add_edge(edge, 1)
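# The function below is the TextRank sentence-similarity measure:
#   similarity(S1, S2) = |words shared by S1 and S2| / (log10(|S1|) + log10(|S2|))
# where |S| is the word count of a sentence. If the denominator is zero
# (e.g. two one-word sentences), the similarity is defined as 0.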
def _get_similarity(s1, s2):
words_sentence_one = s1.split()
words_sentence_two = s2.split()
common_word_count = _count_common_words(words_sentence_one, words_sentence_two)
log_s1 = log10(len(words_sentence_one))
log_s2 = log10(len(words_sentence_two))
if log_s1 + log_s2 == 0:
return 0
return common_word_count / (log_s1 + log_s2)
def _count_common_words(words_sentence_one, words_sentence_two):
return len(set(words_sentence_one) & set(words_sentence_two))
def _format_results(extracted_sentences, split, score):
if score:
return [(sentence.text, sentence.score) for sentence in extracted_sentences]
if split:
return [sentence.text for sentence in extracted_sentences]
return "\n".join([sentence.text for sentence in extracted_sentences])
def _add_scores_to_sentences(sentences, scores):
for sentence in sentences:
# Adds the score to the object if it has one.
if sentence.token in scores:
sentence.score = scores[sentence.token]
else:
sentence.score = 0
def _get_sentences_with_word_count(sentences, words):
""" Given a list of sentences, returns a list of sentences with a
total word count similar to the word count provided.
"""
word_count = 0
selected_sentences = []
# Loops until the word count is reached.
for sentence in sentences:
words_in_sentence = len(sentence.text.split())
# Checks if the inclusion of the sentence gives a better approximation
# to the word parameter.
if abs(words - word_count - words_in_sentence) > abs(words - word_count):
return selected_sentences
selected_sentences.append(sentence)
word_count += words_in_sentence
return selected_sentences
def _extract_most_important_sentences(sentences, ratio, words):
sentences.sort(key=lambda s: s.score, reverse=True)
# If no "words" option is selected, the number of sentences is
# reduced by the provided ratio.
if words is None:
length = len(sentences) * ratio
return sentences[:int(length)]
# Else, the ratio is ignored.
else:
return _get_sentences_with_word_count(sentences, words)
def summarize(text, ratio=0.2, words=None, language="english", split=False, scores=False, additional_stopwords=None):
if not isinstance(text, str):
raise ValueError("Text parameter must be a Unicode object (str)!")
# Gets a list of processed sentences.
sentences = _clean_text_by_sentences(text, language, additional_stopwords)
# Creates the graph and calculates the similarity coefficient for every pair of nodes.
graph = _build_graph([sentence.token for sentence in sentences])
_set_graph_edge_weights(graph)
# Remove all nodes with all edges weights equal to zero.
_remove_unreachable_nodes(graph)
# PageRank cannot be run in an empty graph.
if len(graph.nodes()) == 0:
return [] if split else ""
# Ranks the tokens using the PageRank algorithm. Returns dict of sentence -> score
pagerank_scores = _pagerank(graph)
# Adds the summa scores to the sentence objects.
_add_scores_to_sentences(sentences, pagerank_scores)
# EDIT: return the whole sentences with scores
return sentences
# Extracts the most important sentences with the selected criterion.
# extracted_sentences = _extract_most_important_sentences(sentences, ratio, words)
# Sorts the extracted sentences by apparition order in the original text.
# extracted_sentences.sort(key=lambda s: s.index)
# return _format_results(extracted_sentences, split, scores)
def get_graph(text, language="english"):
sentences = _clean_text_by_sentences(text, language)
graph = _build_graph([sentence.token for sentence in sentences])
_set_graph_edge_weights(graph)
return graph
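
Because of the EDIT above, summarize() now returns every SyntacticUnit with its PageRank score instead of a trimmed summary, so the caller decides how to map score to presentation. A minimal, hedged sketch (the min/max normalisation here is illustrative only; warehouse.txt is the sample text included in this commit):

from summa.summarizer import summarize

with open("warehouse.txt") as f:
    sentences = summarize(f.read())  # list of SyntacticUnit, each with .text and .score

lo = min(s.score for s in sentences)
hi = max(s.score for s in sentences)
for s in sentences:
    opacity = (s.score - lo) / (hi - lo) if hi != lo else 1.0
    print(f'<span style="opacity:{opacity:.3f};">{s.text}</span>')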

@ -0,0 +1,14 @@
class SyntacticUnit(object):
def __init__(self, text, token=None, tag=None):
self.text = text
self.token = token
self.tag = tag[:2] if tag else None # just first two letters of tag
self.index = -1
self.score = -1
def __str__(self):
return "Original unit: '" + self.text + "' *-*-*-* " + "Processed unit: '" + self.token + "'"
def __repr__(self):
return str(self)

@ -0,0 +1,97 @@
import argparse
import os
import sys
import warnings
from .summarizer import summarize
from .keywords import keywords
# Types of summarization
SENTENCE = 0
WORD = 1
DEFAULT_RATIO = 0.2
def textrank(text, summarize_by=SENTENCE, ratio=DEFAULT_RATIO, words=None, additional_stopwords=None):
if summarize_by == SENTENCE:
return summarize(text, ratio, words, additional_stopwords=additional_stopwords)
else:
return keywords(text, ratio, words, additional_stopwords=additional_stopwords)
def existing_file(file_name):
try:
with open(file_name, 'r') as file:
return file.read()
except Exception:
raise argparse.ArgumentTypeError("The file provided could not be opened.")
def restricted_float(x):
x = float(x)
if x < 0.0 or x > 1.0:
raise argparse.ArgumentTypeError("{} not in range [0.0, 1.0]".format(x))
return x
def parse_args(args):
parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter, prog="textrank", description="Extract the most relevant sentences or keywords of a given text using the TextRank algorithm.")
group = parser.add_mutually_exclusive_group(required=True)
# New API
group.add_argument('--summarize', metavar="path/to/file", type=existing_file,
help="Run textrank to summarize the input text.")
group.add_argument('--keywords', metavar="path/to/file", type=existing_file,
help="Run textrank to extract keywords from the input text.")
# Old API
group.add_argument('--text', '-t', metavar="path/to/file", type=existing_file,
help="(Deprecated) Text to summarize if --summary option is selected")
parser.add_argument('--summary', '-s', metavar="{0,1}", type=int, choices=[SENTENCE, WORD], default=0,
help="(Deprecated) Type of unit to summarize: sentence (0) or word (1)")
parser.add_argument('--ratio', '-r', metavar="r", type=restricted_float, default=DEFAULT_RATIO,
help="Float number (0,1] that defines the length of the summary. It's a proportion of the original text")
parser.add_argument('--words', '-w', metavar="#words", type=int,
help="Number to limit the length of the summary. The length option is ignored if the word limit is set.")
parser.add_argument('--additional_stopwords', '-a', metavar="list,of,stopwords",
help="Either a string of comma separated stopwords or a path to a file which has comma separated stopwords in every line")
return parser.parse_args(args)
def main():
args = parse_args(sys.argv[1:])
mode = None
text = None
if args.summarize:
text = args.summarize
mode = SENTENCE
elif args.keywords:
text = args.keywords
mode = WORD
elif args.summary: # Old api
warnings.warn("The --summary option is deprecated. Please use either --summarize or --keywords", DeprecationWarning)
text = args.text
mode = args.summary
if text is None:
raise argparse.ArgumentTypeError('Error: no text to summarize provided.')
else:
raise argparse.ArgumentTypeError('Error: --summarize or --keywords is required')
additional_stopwords = None
if args.additional_stopwords:
if os.path.exists(args.additional_stopwords):
with open(args.additional_stopwords) as f:
additional_stopwords = {s for l in f for s in l.strip().split(",")}
else:
additional_stopwords = args.additional_stopwords.split(",")
print(textrank(text, mode, args.ratio, args.words, additional_stopwords))
if __name__ == "__main__":
main()
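
A hedged example of driving the CLI above; the module path summa.textrank is an assumption based on the vendored package layout, not something stated in this commit.

# roughly equivalent to running: python -m summa.textrank --summarize warehouse.txt
import subprocess, sys

subprocess.run(
    [sys.executable, "-m", "summa.textrank", "--summarize", "warehouse.txt"],
    check=True,
)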

@ -0,0 +1,31 @@
<!DOCTYPE html>
<html lang="fr">
<head>
<meta charset="UTF-8" />
<title>TextRank Opacity</title>
<meta name="description" content="a call for 2 desks in studio 5 of the Meyboom artist-run spaces">
<link rel="stylesheet" type="text/css" href="css/main.css" />
<link rel="stylesheet" type="text/css" href="css/typography.css" />
<meta name="viewport" content="width=device-width, initial-scale=1">
</head>
<body>
<main>
{% for s in sentences %}
{{ s.html|safe }}
{% endfor %}
</main>
</body>
</html>

@ -0,0 +1 @@
A warehouse is a building for storing goods. Warehouses are used by manufacturers, importers, exporters, wholesalers, transport businesses, customs, etc. They are usually large plain buildings in industrial parks on the outskirts of cities, towns, or villages. They usually have loading docks to load and unload goods from trucks. Sometimes warehouses are designed for the loading and unloading of goods directly from railways, airports, or seaports. They often have cranes and forklifts for moving goods, which are usually placed on ISO standard pallets and then loaded into pallet racks. Stored goods can include any raw materials, packing materials, spare parts, components, or finished goods associated with agriculture, manufacturing, and production. In India and Hong Kong, a warehouse may be referred to as a "godown". There are also godowns in the Shanghai Bund.

@ -0,0 +1,38 @@
:root{
--lh: 1.35rem;
}
body{
margin: var(--lh);
line-height: var(--lh);
}
@media print{
body{
margin: 0;
font-size: 10pt;
}
}
main{
max-width: 42rem;
margin: 0 auto;
}
/* h1,h2,h3,h4,h5,h6{
line-height: var(--lh);
} */
h1{
text-align: center;
margin: calc(2 * var(--lh)) 0;
}
h2,h3,h4,h5,h6{
margin: calc(3 * var(--lh)) 0 var(--lh);
}
:is(h1,h2,h3,h4,h5,h6) + :is(h1,h2,h3,h4,h5,h6){
margin-top: var(--lh);
}

@ -0,0 +1,177 @@
<!DOCTYPE html>
<html lang="fr">
<head>
<meta charset="UTF-8" />
<title>TextRank Opacity</title>
<meta name="description" content="a call for 2 desks in studio 5 of the Meyboom artist-run spaces">
<link rel="stylesheet" type="text/css" href="css/main.css" />
<link rel="stylesheet" type="text/css" href="css/typography.css" />
<meta name="viewport" content="width=device-width, initial-scale=1">
</head>
<body>
<main>
<h1>Sambucus</h1>
<span style="opacity:0.023354250368401927;">Sambucus is a genus of flowering plants in the family Adoxaceae.</span>
<span style="opacity:0.008019401476129553;">The various species are commonly called elder or elderberry.</span>
<span style="opacity:0.26031502027326375;">The genus was formerly placed in the honeysuckle family, Caprifoliaceae, but was reclassified as Adoxaceae due to genetic and morphological comparisons to plants in the genus Adoxa.</span>
<h2 style="opacity:0.11901683057809066;">Description</h2>
<span style="opacity:0.34993223091241904;">The oppositely arranged leaves are pinnate with 59 leaflets (or, rarely, 3 or 11).</span>
<span style="opacity:0.6657790550844742;">Each leaf is 530 cm (212 in) long, and the leaflets have serrated margins.</span>
<span style="opacity:0.15164144458890563;">They bear large clusters of small white or cream-colored flowers in late spring; these are followed by clusters of small black, blue-black, or red berries (rarely yellow or white).</span>
<h3 style="opacity:0.037512192700824155;">Color</h3>
<span style="opacity:0.002234682837227867;">Sambucus fruit is rich in anthocyanidins that combine to give elderberry juice an intense blue-purple coloration that turns reddish on dilution with water.</span>
<span style="opacity:0.04596217339828307;">These pigments are used as colorants in various products, and "elderberry juice color" is listed by the US FDA as allowable in certified organic food products.</span>
<span style="opacity:0.06433972186696155;">In Japan, elderberry juice is listed as an approved "natural color additive" under the Food and Sanitation Law. Fibers can be dyed with elderberry juice (using alum as a mordant) to give a light "elderberry" color.</span>
<h3 style="opacity:0.005859165534145169;">Toxicity</h3>
<span style="opacity:0.018703350472802083;">Although the cooked berries (pulp and skin) of most species of Sambucus are edible, the uncooked berries and other parts of plants from this genus are poisonous.</span>
<span style="opacity:0.0;">Leaves, twigs, branches, seeds, roots, flowers, and berries of Sambucus plants produce cyanogenic glycosides, which have toxic properties.</span>
<span style="opacity:0.0024907571958084017;">Ingesting a sufficient quantity of cyanogenic glycosides from berry juice, flower tea, or beverages made from fresh leaves, branches, and fruit has been shown to cause illness, including nausea, vomiting, abdominal cramps, diarrhea, and weakness.</span>
<span style="opacity:0.004068285156595224;">In August 1983, a group of 25 people in Monterey County, California, became suddenly ill by ingesting elderberry juice pressed from fresh, uncooked Sambucus mexicana berries, leaves, and stems.</span>
<span style="opacity:0.004033434845520135;">The density of cyanogenic glycosides is higher in tea made from flowers (or leaves) than from the berries.The seeds of Sambucus callicarpa are reported to be poisonous and may cause vomiting or diarrhea.</span>
<h2 style="opacity:0.3047373240294963;">Taxonomy</h2>
<span style="opacity:0.1435576912325227;">The taxonomy of the genus Sambucus L., originally described by Carl Linnaeus and hence its botanical authority, has been complicated by its wide geographical distribution and morphological diversity.</span>
<span style="opacity:0.4903541107710174;">This has led to overdescription of the species and infraspecific taxa (subspecies, varieties or forms).</span>
<span style="opacity:0.28030017008494884;">The name comes from the Greek word sambuce, an ancient wind instrument, about the removal of pith from the twigs to make whistles.Species recognized in this genus are:</span>
<h2 style="opacity:0.3774205134479782;">Distribution and habitat</h2>
<span style="opacity:0.5170795700859395;">The genus occurs in temperate to subtropical regions of the world.</span>
<span style="opacity:0.7289657600921431;">More widespread in the Northern Hemisphere, its Southern Hemisphere occurrence is restricted to parts of Australasia and South America.</span>
<span style="opacity:0.004931060704500269;">Many species are widely cultivated for their ornamental leaves, flowers, and fruit.</span>
<h3 style="opacity:0.34774180081331607;">Habitat</h3>
<span style="opacity:0.23667717197372362;">Elder commonly grows near farms and homesteads.</span>
<span style="opacity:0.3192589413693254;">It is a nitrogen-dependent plant and thus is generally found near places of organic waste disposal.</span>
<span style="opacity:0.20272560262481226;">Elders are often grown as a hedgerow plant in Britain since they take very fast, can be bent into shape easily, and grow quite profusely, thus having gained the reputation of being 'an instant hedge'.</span>
<span style="opacity:0.632305487285403;">It is not generally affected by soil type or pH level and will virtually grow anywhere sufficient sunlight is available.</span>
<h2 style="opacity:0.36011640223198155;">Ecology</h2>
<span style="opacity:0.09934447961441183;">In Northern California, elderberries are a food for migrating band-tailed pigeons.</span>
<span style="opacity:0.10833689405967695;">Elders are used as food plants by the larvae of some Lepidoptera species including brown-tail, buff ermine, dot moth, emperor moth, engrailed moth, swallow-tailed moth and the V-pug.</span>
<span style="opacity:0.40509224339149436;">The crushed foliage and immature fruit have a strong fetid smell.</span>
<span style="opacity:0.16493413805985815;">Valley elderberry longhorn beetles in California are very often found around red or blue elderberry bushes.</span>
<span style="opacity:1;">Females lay their eggs on the bark.</span>
<span style="opacity:0.38299065826644807;">The pith of elder has been used by watchmakers for cleaning tools before intricate work.</span>
<h2 style="opacity:0.4959298303208725;">Cultivation</h2>
<span style="opacity:0.0023273321750337233;">Traditional uses of Sambucus involved berries, seeds, leaves, and flowers or component extracts.</span>
<span style="opacity:0.021538616933372428;">Ornamental varieties of Sambucus are grown in gardens for their showy flowers, fruits and lacy foliage which support habitat for wildlife.</span>
<span style="opacity:0.37967191922582566;">Of the many native species, three are used as ornamentals, S.</span>
<span style="opacity:0.5720411135910031;">nigra, S.</span>
<span style="opacity:1;">canadensis and S.</span>
<span style="opacity:1;">racemosa.</span>
<h2 style="opacity:0.26037935627574993;">Uses</h2>
<h3 style="opacity:0.17679277695747428;">Nutrition</h3>
<span style="opacity:0.16562834706461427;">Raw elderberries are 80% water, 18% carbohydrates, and less than 1% each of protein and fat (table).</span>
<span style="opacity:0.1368265507355418;">In a 100-gram (3+12 oz) amount, elderberries supply 305 kilojoules (73 kcal) of food energy and are a rich source of vitamin C, providing 43% of the Daily Value (DV).</span>
<span style="opacity:0.2279234330722667;">Elderberries also have moderate contents of vitamin B6 (18% DV) and iron (12% DV), with no other nutrients in significant content.</span>
<h3 style="opacity:0.2385522392706748;">Dietary supplement</h3>
<span style="opacity:0.007324180292626893;">Elderberry fruit or flowers are used as dietary supplements to prevent or provide relief from minor diseases, such as flu, colds, constipation, and other conditions, served as a tea, extract or in a capsule.</span>
<span style="opacity:0.07793442880325537;">The use of elderberry supplements increased early in the COVID-19 pandemic.</span>
<span style="opacity:0.5690343662424139;">There is insufficient research to establish its effectiveness for such uses, or its safety profile.</span>
<span style="opacity:0.3046053054729051;">The raw or unripe fruit of S.</span>
<span style="opacity:0.23386291554217284;">nigra or its extracts may contain a cyanogenic glycoside that is potentially toxic.</span>
<h3 style="opacity:0.30036772549577245;">Traditional medicine</h3>
<span style="opacity:0.12635509010408633;">Although practitioners of traditional medicine have used elderberry over centuries, there is no high-quality clinical evidence that such practices provide any benefit.</span>
<span style="opacity:0.04619027362780958;">The flowers of Sambucus nigra are used to produce elderflower cordial.</span>
<span style="opacity:0.5605661316829006;">St-Germain, a French liqueur, is made from elderflowers.</span>
<span style="opacity:0.6249597590948577;">Hallands Fläder, a Swedish akvavit, is flavoured with elderflowers.</span>
<span style="opacity:0.058473741273256635;">Hollowed elderberry twigs have traditionally been used as spiles to tap maple trees for syrup.</span>
<span style="opacity:0.36742927908821876;">Additionally, they have been hollowed out and used as flutes, blowguns, and syringes.The fruit of S.</span>
<span style="opacity:0.7665194357270494;">callicarpa is eaten by birds and mammals.</span>
<span style="opacity:0.04897270752302531;">It is inedible to humans when raw but can be made into wine.Elderberry twigs and fruit are employed in creating dyes for basketry.</span>
<span style="opacity:0.10384311134074758;">These stems are dyed a very deep black by soaking them in a wash made from the berry stems of the elderberry.</span>
<h2 style="opacity:0.1940215897275913;">In popular culture</h2>
<span style="opacity:0.26902332053709666;">Folklore related to elder trees is extensive and can vary according to region.</span>
<span style="opacity:0.042182724177350944;">In some traditions, the elder tree is thought to ward off evil and give protection from witches, while other beliefs say that witches often congregate under the plant, especially when it is full of fruit.</span>
<span style="opacity:0.24981940690065454;">If an elder tree was cut down, a spirit known as the Elder Mother would be released and take her revenge.</span>
<span style="opacity:0.18680917834736657;">The tree could only safely be cut while chanting a rhyme to the Elder Mother.Made from the branch of an elder tree, the Elder Wand plays a pivotal role in the final book of the Harry Potter series, which was nearly named Harry Potter and the Elder Wand before author J.</span>
<span style="opacity:0.22868031954081833;">K. Rowling decided on Harry Potter and the Deathly Hallows.Elton John's 1973 album Don't Shoot Me I'm Only the Piano Player features a song titled "Elderberry Wine".</span>
<span style="opacity:0.1876145888622608;">In Monty Python and the Holy Grail, John Cleese as the French Taunter tells the knights of Camelot, "Your mother was a hamster, and your father smelt of elderberries."</span>
<h2 style="opacity:NaN;">Gallery</h2>
</main>
</body>
</html>