first experiment with opacity
commit b8343c650f
@@ -0,0 +1,4 @@
opacity experiment using:

* textrank python implementation (https://github.com/summanlp/textrank), modified under `summa/` so that it gives us all the sentences with their score (see the sketch just below)
* wikipedia python module (https://pypi.org/project/wikipedia/)
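A minimal sketch of how the two pieces fit together, assuming the `summa/` fork returns scored sentence objects (`.text`, `.score`) rather than the plain strings the stock library returns; the page name is only an example:

```python
import wikipedia
from summa.summarizer import summarize

wikipedia.set_lang("en")
text = wikipedia.WikipediaPage("elderflower").content   # any page works
for s in summarize(text, split=True):                    # all sentences, with scores
    print('<span style="opacity:%s;">%s</span>' % (s.score, s.text))
```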
@@ -0,0 +1,233 @@
from jinja2 import Template
import os
import wikipedia
from markdown import markdown

# importing module
import sys

# appending a path
# sys.path.append('textrank')

# importing required module
import summa.summarizer
from summa.summarizer import summarize


# TODO:
# * DONE: wiki header

# those 3 would ask to start from the HTML itself and keep an index...
# * wiki paragraph
# * wiki hyperlinks
# * list


# variables
# ------------------------------------------------------------------------

# wikipedia_page = "forest"
# wikipedia_page = "warehouse"
# wikipedia_page = "river"
wikipedia_page = "elderflower"
# wikipedia_page = "mushroom"

TEMPLATE_PATH = 'template.html'
HTML_PATH = 'www/index.html'


# utilities
# ------------------------------------------------------------------------

def map_value(value, min, max, new_min, new_max):
    return (((value - min) / (max - min)) * (new_max - new_min)) + new_min


def remap_score(s, min_score, max_score):
    s.score = 1 - map_value(s.score, min_score, max_score, 0, 1)
    return s


def compress_score(s):

    # compress whites
    s.score = s.score**3

    # stretch + limiter
    # s.score = min(map_value(s.score, 0, 1, 0, 1.5), 1)
    s.score = 1 if s.score > 0.8 else s.score

    return s
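

# Illustration (invented numbers, not part of the committed script): trace a
# few raw TextRank scores through remap_score() and compress_score() above.
# The top-ranked sentence ends up fully transparent, the lowest-ranked fully
# opaque, and the cube plus the 0.8 clamp push mid-range values towards 0.
class _Scored:                                   # stand-in for a ranked sentence
    def __init__(self, score):
        self.score = score

_min, _max = 0.1, 1.2                            # score extremes found on a page
for raw in (1.2, 0.9, 0.2, 0.1):
    demo = compress_score(remap_score(_Scored(raw), _min, _max))
    print(raw, '->', round(demo.score, 3))
# 1.2 -> 0.0      0.9 -> 0.02      0.2 -> 0.751      0.1 -> 1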


# wikipedia
# ------------------------------------------------------------------------

def wikipage(pagename):
    # get wikipedia page content by name of the page

    print(pagename)
    wikipedia.set_lang("en")
    try:
        results = wikipedia.search(pagename, results=1, suggestion=False)
        try:
            pagename = results[0]
        except IndexError:
            # if there is no suggestion or search results, the page doesn't exist
            raise wikipedia.PageError(pagename)
        return wikipedia.WikipediaPage(pagename, redirect=True, preload=True)
    except wikipedia.exceptions.DisambiguationError as e:
        print(e.options)
        page = ''

    return page
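

# Illustration (hypothetical page name): wikipage() returns a preloaded
# WikipediaPage, or '' when the name lands on a Wikipedia disambiguation page
# (a name like "mercury" typically does), which the caller treats as a stop.
demo_page = wikipage("mercury")
if not demo_page:
    print("ambiguous or missing page, nothing to render")
else:
    print(demo_page.title, len(demo_page.content), "characters")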


# parsing and gluing html
# ------------------------------------------------------------------------

def is_header(s):

    # i is the header level
    i = 0
    while s.text[i] == '=' and s.text[len(s.text) - 1 - i] == '=':
        i += 1

    if i > 0:
        header_text = s.text[i:(-1-i)].strip()
        header_level = i
        return [header_text, header_level]


def wiki_parse(sentences):

    # TODO: doesn't work with section nesting!!
    # 1. replace wikitext header with html header
    # 2. add the opacity to each element
    # 3. compute an artificial score for each header that is the average score of its section

    new_sentences = []

    print('--- HEADERS ---')
    for i in range(len(sentences)):

        s = sentences[i]

        # if the sentence is a header
        header = is_header(s)
        if header:
            print(header[0])

            # start computing the average score of this section
            current_total = 0
            current_count = 0
            next_header_found = False
            j = i + 1

            # iterate until we find the next header of same or higher level
            while j < len(sentences) and not next_header_found:

                s2 = sentences[j]
                s2_header = is_header(s2)

                if s2_header:
                    print(' ' + s2_header[0])
                    if header[1] >= s2_header[1]:
                        # encountered a header of same or higher level
                        next_header_found = True
                        print('X ' + s2_header[0])

                else:
                    # add every plain sentence to the average
                    current_total += s2.score
                    current_count += 1

                j += 1

            if current_count != 0:
                s.score = current_total / current_count
            else:
                s.score = "NaN"

            s.html = '<h'+str(header[1])+' style="opacity:'+str(s.score)+';">'+header[0]+'</h'+str(header[1])+'>'

            # stop at the references part
            if header[0] == "References" or header[0] == "See also":
                break

            new_sentences.append(s)

        # not a header
        else:
            s.html = '<span style="opacity:'+str(s.score)+';">'+s.text+'</span>'
            new_sentences.append(s)

    return new_sentences
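

# Illustration (invented sentences): what wiki_parse() emits. A wikitext
# header such as "== Etymology ==" becomes an <h2> whose opacity is the
# average score of its section; any other sentence becomes a <span> carrying
# its own score as opacity.
class _Fake:                                     # stand-in for a ranked sentence
    def __init__(self, text, score):
        self.text, self.score = text, score

for u in wiki_parse([_Fake("== Etymology ==", 0.5),
                     _Fake("The flower lends its name to a cordial.", 0.25)]):
    print(u.html)
# <h2 style="opacity:0.25;">Etymology</h2>
# <span style="opacity:0.25;">The flower lends its name to a cordial.</span>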


# textrank
# ------------------------------------------------------------------------

def txt2rankedsentences(txt):
    # from txt to ranked sentences
    return summarize(txt, split=True)
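

# Note (assumption, based on the README): the stock summa summarize() with
# split=True returns only the top-ranked sentences, as plain strings. The fork
# vendored under summa/ is described as returning *all* sentences together
# with their scores, i.e. objects exposing roughly `.text` and `.score`,
# which is what remap_score(), compress_score() and wiki_parse() above rely on.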


# main
# ------------------------------------------------------------------------

if __name__ == '__main__':


    # --- WIKI REQUEST ---

    # get text from wikipedia
    print('--- WIKI ---')
    page = wikipage(wikipedia_page)
    if not page:
        sys.exit("--- STOP ---")
    title = '<h1>'+page.title+'</h1>'
    text = page.content

    # print text in terminal
    print('--- TXT ---')
    print(text)


    # --- APPLY TEXTRANK ---

    # apply textrank
    sentences = txt2rankedsentences(text)

    # print ranked sentences in terminal
    print('--- SENTENCES ---')
    for s in sentences:
        print('[{score}] : {sentence}'.format(score = s.score, sentence = s.text))


    # --- REMAP AND COMPRESS ---

    # sorted version of the list
    sorted_sentences = sorted(sentences, key=lambda s: s.score, reverse=True)
    # remap sentence scores from 0 to 1
    max_score = sorted_sentences[0].score
    min_score = sorted_sentences[-1].score
    sentences = [remap_score(s, min_score, max_score) for s in sentences]
    # compress scores (make more stuff invisible)
    sentences = [compress_score(s) for s in sentences]


    # --- PARSE ---

    # parse every sentence to either a span or a header
    sentences = wiki_parse(sentences)
    # add back the page title
    sentences = [{ 'html': title, 'text': page.title, 'score': 1 }] + sentences


    # --- TEMPLATING ---

    # get the template
    with open(TEMPLATE_PATH, 'r') as file:
        template = Template(file.read())
    # render template
    html = template.render(sentences = sentences)
    with open(HTML_PATH, 'w') as file:
        file.write(html)
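
# Illustration (hypothetical, not part of this commit): the script expects a
# Jinja2 template at template.html and writes the result to www/index.html.
# A minimal template that just concatenates the pre-built fragments could be
# generated like this:
minimal_template = """<!DOCTYPE html>
<html><head><meta charset="utf-8"><title>opacity</title></head>
<body>
{% for s in sentences %}{{ s.html }}
{% endfor %}
</body></html>"""

with open('template.html', 'w') as f:
    f.write(minimal_template)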
@@ -0,0 +1,2 @@
from summa import commons, graph, keywords, pagerank_weighted, \
    summarizer, syntactic_unit, textrank
Binary files not shown.
@@ -0,0 +1,15 @@
from .graph import Graph


def build_graph(sequence):
    graph = Graph()
    for item in sequence:
        if not graph.has_node(item):
            graph.add_node(item)
    return graph


def remove_unreachable_nodes(graph):
    for node in graph.nodes():
        if sum(graph.edge_weight((node, other)) for other in graph.neighbors(node)) == 0:
            graph.del_node(node)
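
# Illustration (invented tokens): build_graph() adds one node per unique item;
# remove_unreachable_nodes() then drops nodes whose edges all weigh zero, so
# isolated words never reach PageRank.
demo_graph = build_graph(["elder", "flower", "cordial", "elder"])   # "elder" added once
demo_graph.add_edge(("elder", "flower"))
remove_unreachable_nodes(demo_graph)
print(demo_graph.nodes())   # ['elder', 'flower'] -- "cordial" had no weighted edge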
@@ -0,0 +1,2 @@
class TextrankRuntimeError(RuntimeError):
    pass
@ -0,0 +1,244 @@
|
|||||||
|
from abc import ABCMeta, abstractmethod
|
||||||
|
|
||||||
|
|
||||||
|
class IGraph(metaclass=ABCMeta):
|
||||||
|
"""
|
||||||
|
Represents the interface or contract that the graph for TextRank should implement
|
||||||
|
"""
|
||||||
|
|
||||||
|
@abstractmethod
|
||||||
|
def nodes(self):
|
||||||
|
"""
|
||||||
|
Return node list.
|
||||||
|
|
||||||
|
@rtype: list
|
||||||
|
@return: Node list.
|
||||||
|
"""
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
@abstractmethod
|
||||||
|
def edges(self):
|
||||||
|
"""
|
||||||
|
Return all edges in the graph.
|
||||||
|
|
||||||
|
@rtype: list
|
||||||
|
@return: List of all edges in the graph.
|
||||||
|
"""
|
||||||
|
pass
|
||||||
|
|
||||||
|
@abstractmethod
|
||||||
|
def neighbors(self, node):
|
||||||
|
"""
|
||||||
|
Return all nodes that are directly accessible from given node.
|
||||||
|
|
||||||
|
@type node: node
|
||||||
|
@param node: Node identifier
|
||||||
|
|
||||||
|
@rtype: list
|
||||||
|
@return: List of nodes directly accessible from given node.
|
||||||
|
"""
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
@abstractmethod
|
||||||
|
def has_node(self, node):
|
||||||
|
"""
|
||||||
|
Return whether the requested node exists.
|
||||||
|
|
||||||
|
@type node: node
|
||||||
|
@param node: Node identifier
|
||||||
|
|
||||||
|
@rtype: boolean
|
||||||
|
@return: Truth-value for node existence.
|
||||||
|
"""
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
@abstractmethod
|
||||||
|
def add_node(self, node, attrs=None):
|
||||||
|
"""
|
||||||
|
Add given node to the graph.
|
||||||
|
|
||||||
|
@attention: While nodes can be of any type, it's strongly recommended to use only
|
||||||
|
numbers and single-line strings as node identifiers if you intend to use write().
|
||||||
|
|
||||||
|
@type node: node
|
||||||
|
@param node: Node identifier.
|
||||||
|
|
||||||
|
@type attrs: list
|
||||||
|
@param attrs: List of node attributes specified as (attribute, value) tuples.
|
||||||
|
"""
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
@abstractmethod
|
||||||
|
def add_edge(self, edge, wt=1, label='', attrs=[]):
|
||||||
|
"""
|
||||||
|
Add an edge to the graph connecting two nodes.
|
||||||
|
|
||||||
|
An edge, here, is a pair of nodes like C{(n, m)}.
|
||||||
|
|
||||||
|
@type edge: tuple
|
||||||
|
@param edge: Edge.
|
||||||
|
|
||||||
|
@type wt: number
|
||||||
|
@param wt: Edge weight.
|
||||||
|
|
||||||
|
@type label: string
|
||||||
|
@param label: Edge label.
|
||||||
|
|
||||||
|
@type attrs: list
|
||||||
|
@param attrs: List of node attributes specified as (attribute, value) tuples.
|
||||||
|
"""
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
@abstractmethod
|
||||||
|
def has_edge(self, edge):
|
||||||
|
"""
|
||||||
|
Return whether an edge exists.
|
||||||
|
|
||||||
|
@type edge: tuple
|
||||||
|
@param edge: Edge.
|
||||||
|
|
||||||
|
@rtype: boolean
|
||||||
|
@return: Truth-value for edge existence.
|
||||||
|
"""
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
@abstractmethod
|
||||||
|
def edge_weight(self, edge):
|
||||||
|
"""
|
||||||
|
Get the weight of an edge.
|
||||||
|
|
||||||
|
@type edge: edge
|
||||||
|
@param edge: One edge.
|
||||||
|
|
||||||
|
@rtype: number
|
||||||
|
@return: Edge weight.
|
||||||
|
"""
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
@abstractmethod
|
||||||
|
def del_node(self, node):
|
||||||
|
"""
|
||||||
|
Remove a node from the graph.
|
||||||
|
|
||||||
|
@type node: node
|
||||||
|
@param node: Node identifier.
|
||||||
|
"""
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
class Graph(IGraph):
|
||||||
|
"""
|
||||||
|
Implementation of an undirected graph, based on Pygraph
|
||||||
|
"""
|
||||||
|
|
||||||
|
WEIGHT_ATTRIBUTE_NAME = "weight"
|
||||||
|
DEFAULT_WEIGHT = 0
|
||||||
|
|
||||||
|
LABEL_ATTRIBUTE_NAME = "label"
|
||||||
|
DEFAULT_LABEL = ""
|
||||||
|
|
||||||
|
def __init__(self):
|
||||||
|
# Metadata about edges
|
||||||
|
self.edge_properties = {} # Mapping: Edge -> Dict mapping, lablel-> str, wt->num
|
||||||
|
self.edge_attr = {} # Key value pairs: (Edge -> Attributes)
|
||||||
|
# Metadata about nodes
|
||||||
|
self.node_attr = {} # Pairing: Node -> Attributes
|
||||||
|
self.node_neighbors = {} # Pairing: Node -> Neighbors
|
||||||
|
|
||||||
|
def has_edge(self, edge):
|
||||||
|
u,v = edge
|
||||||
|
return (u,v) in self.edge_properties and (v,u) in self.edge_properties
|
||||||
|
|
||||||
|
def edge_weight(self, edge):
|
||||||
|
return self.get_edge_properties( edge ).setdefault( self.WEIGHT_ATTRIBUTE_NAME, self.DEFAULT_WEIGHT )
|
||||||
|
|
||||||
|
def neighbors(self, node):
|
||||||
|
return self.node_neighbors[node]
|
||||||
|
|
||||||
|
def has_node(self, node):
|
||||||
|
return node in self.node_neighbors
|
||||||
|
|
||||||
|
def add_edge(self, edge, wt=1, label='', attrs=[]):
|
||||||
|
u, v = edge
|
||||||
|
if (v not in self.node_neighbors[u] and u not in self.node_neighbors[v]):
|
||||||
|
self.node_neighbors[u].append(v)
|
||||||
|
if (u != v):
|
||||||
|
self.node_neighbors[v].append(u)
|
||||||
|
|
||||||
|
self.add_edge_attributes((u,v), attrs)
|
||||||
|
self.set_edge_properties((u, v), label=label, weight=wt)
|
||||||
|
else:
|
||||||
|
raise ValueError("Edge (%s, %s) already in graph" % (u, v))
|
||||||
|
|
||||||
|
def add_node(self, node, attrs=None):
|
||||||
|
if attrs is None:
|
||||||
|
attrs = []
|
||||||
|
if (not node in self.node_neighbors):
|
||||||
|
self.node_neighbors[node] = []
|
||||||
|
self.node_attr[node] = attrs
|
||||||
|
else:
|
||||||
|
raise ValueError("Node %s already in graph" % node)
|
||||||
|
|
||||||
|
def nodes(self):
|
||||||
|
return list(self.node_neighbors.keys())
|
||||||
|
|
||||||
|
def edges(self):
|
||||||
|
return [ a for a in list(self.edge_properties.keys()) ]
|
||||||
|
|
||||||
|
def del_node(self, node):
|
||||||
|
for each in list(self.neighbors(node)):
|
||||||
|
if (each != node):
|
||||||
|
self.del_edge((each, node))
|
||||||
|
del(self.node_neighbors[node])
|
||||||
|
del(self.node_attr[node])
|
||||||
|
|
||||||
|
# Helper methods
|
||||||
|
def get_edge_properties(self, edge):
|
||||||
|
return self.edge_properties.setdefault( edge, {} )
|
||||||
|
|
||||||
|
def add_edge_attributes(self, edge, attrs):
|
||||||
|
for attr in attrs:
|
||||||
|
self.add_edge_attribute(edge, attr)
|
||||||
|
|
||||||
|
def add_edge_attribute(self, edge, attr):
|
||||||
|
self.edge_attr[edge] = self.edge_attributes(edge) + [attr]
|
||||||
|
|
||||||
|
if (edge[0] != edge[1]):
|
||||||
|
self.edge_attr[(edge[1],edge[0])] = self.edge_attributes((edge[1], edge[0])) + [attr]
|
||||||
|
|
||||||
|
def edge_attributes(self, edge):
|
||||||
|
try:
|
||||||
|
return self.edge_attr[edge]
|
||||||
|
except KeyError:
|
||||||
|
return []
|
||||||
|
|
||||||
|
def set_edge_properties(self, edge, **properties ):
|
||||||
|
self.edge_properties.setdefault( edge, {} ).update( properties )
|
||||||
|
if (edge[0] != edge[1]):
|
||||||
|
self.edge_properties.setdefault((edge[1], edge[0]), {}).update( properties )
|
||||||
|
|
||||||
|
def del_edge(self, edge):
|
||||||
|
u, v = edge
|
||||||
|
self.node_neighbors[u].remove(v)
|
||||||
|
self.del_edge_labeling((u, v))
|
||||||
|
if (u != v):
|
||||||
|
self.node_neighbors[v].remove(u)
|
||||||
|
self.del_edge_labeling((v, u)) # TODO: This is redundant
|
||||||
|
|
||||||
|
def del_edge_labeling( self, edge ):
|
||||||
|
keys = [edge]
|
||||||
|
keys.append(edge[::-1])
|
||||||
|
|
||||||
|
for key in keys:
|
||||||
|
for mapping in [self.edge_properties, self.edge_attr ]:
|
||||||
|
try:
|
||||||
|
del ( mapping[key] )
|
||||||
|
except KeyError:
|
||||||
|
pass
|
@ -0,0 +1,227 @@
|
|||||||
|
from itertools import combinations as _combinations
|
||||||
|
from queue import Queue
|
||||||
|
|
||||||
|
from .pagerank_weighted import pagerank_weighted_scipy as _pagerank
|
||||||
|
from .preprocessing.textcleaner import clean_text_by_word as _clean_text_by_word
|
||||||
|
from .preprocessing.textcleaner import tokenize_by_word as _tokenize_by_word
|
||||||
|
from .commons import build_graph as _build_graph
|
||||||
|
from .commons import remove_unreachable_nodes as _remove_unreachable_nodes
|
||||||
|
|
||||||
|
WINDOW_SIZE = 2
|
||||||
|
|
||||||
|
"""Check tags in http://www.clips.ua.ac.be/pages/mbsp-tags and use only first two letters
|
||||||
|
Example: filter for nouns and adjectives:
|
||||||
|
INCLUDING_FILTER = ['NN', 'JJ']"""
|
||||||
|
INCLUDING_FILTER = ['NN', 'JJ']
|
||||||
|
EXCLUDING_FILTER = []
|
||||||
|
|
||||||
|
|
||||||
|
def _get_pos_filters():
|
||||||
|
return frozenset(INCLUDING_FILTER), frozenset(EXCLUDING_FILTER)
|
||||||
|
|
||||||
|
|
||||||
|
def _get_words_for_graph(tokens):
|
||||||
|
include_filters, exclude_filters = _get_pos_filters()
|
||||||
|
if include_filters and exclude_filters:
|
||||||
|
raise ValueError("Can't use both include and exclude filters, should use only one")
|
||||||
|
|
||||||
|
result = []
|
||||||
|
for word, unit in tokens.items():
|
||||||
|
if exclude_filters and unit.tag in exclude_filters:
|
||||||
|
continue
|
||||||
|
if (include_filters and unit.tag in include_filters) or not include_filters or not unit.tag:
|
||||||
|
result.append(unit.token)
|
||||||
|
return result
|
||||||
|
|
||||||
|
|
||||||
|
def _get_first_window(split_text):
|
||||||
|
return split_text[:WINDOW_SIZE]
|
||||||
|
|
||||||
|
|
||||||
|
def _set_graph_edge(graph, tokens, word_a, word_b):
|
||||||
|
if word_a in tokens and word_b in tokens:
|
||||||
|
lemma_a = tokens[word_a].token
|
||||||
|
lemma_b = tokens[word_b].token
|
||||||
|
edge = (lemma_a, lemma_b)
|
||||||
|
|
||||||
|
if graph.has_node(lemma_a) and graph.has_node(lemma_b) and not graph.has_edge(edge):
|
||||||
|
graph.add_edge(edge)
|
||||||
|
|
||||||
|
|
||||||
|
def _process_first_window(graph, tokens, split_text):
|
||||||
|
first_window = _get_first_window(split_text)
|
||||||
|
for word_a, word_b in _combinations(first_window, 2):
|
||||||
|
_set_graph_edge(graph, tokens, word_a, word_b)
|
||||||
|
|
||||||
|
|
||||||
|
def _init_queue(split_text):
|
||||||
|
queue = Queue()
|
||||||
|
first_window = _get_first_window(split_text)
|
||||||
|
for word in first_window[1:]:
|
||||||
|
queue.put(word)
|
||||||
|
return queue
|
||||||
|
|
||||||
|
|
||||||
|
def _process_word(graph, tokens, queue, word):
|
||||||
|
for word_to_compare in _queue_iterator(queue):
|
||||||
|
_set_graph_edge(graph, tokens, word, word_to_compare)
|
||||||
|
|
||||||
|
|
||||||
|
def _update_queue(queue, word):
|
||||||
|
queue.get()
|
||||||
|
queue.put(word)
|
||||||
|
assert queue.qsize() == (WINDOW_SIZE - 1)
|
||||||
|
|
||||||
|
|
||||||
|
def _process_text(graph, tokens, split_text):
|
||||||
|
queue = _init_queue(split_text)
|
||||||
|
for i in range(WINDOW_SIZE, len(split_text)):
|
||||||
|
word = split_text[i]
|
||||||
|
_process_word(graph, tokens, queue, word)
|
||||||
|
_update_queue(queue, word)
|
||||||
|
|
||||||
|
|
||||||
|
def _queue_iterator(queue):
|
||||||
|
iterations = queue.qsize()
|
||||||
|
for i in range(iterations):
|
||||||
|
var = queue.get()
|
||||||
|
yield var
|
||||||
|
queue.put(var)
|
||||||
|
|
||||||
|
|
||||||
|
def _set_graph_edges(graph, tokens, split_text):
|
||||||
|
_process_first_window(graph, tokens, split_text)
|
||||||
|
_process_text(graph, tokens, split_text)
|
||||||
|
|
||||||
|
|
||||||
|
def _extract_tokens(lemmas, scores, ratio, words):
|
||||||
|
lemmas.sort(key=lambda s: scores[s], reverse=True)
|
||||||
|
|
||||||
|
# If no "words" option is selected, the number of sentences is
|
||||||
|
# reduced by the provided ratio, else, the ratio is ignored.
|
||||||
|
length = len(lemmas) * ratio if words is None else words
|
||||||
|
return [(scores[lemmas[i]], lemmas[i],) for i in range(int(length))]
|
||||||
|
|
||||||
|
|
||||||
|
def _lemmas_to_words(tokens):
|
||||||
|
lemma_to_word = {}
|
||||||
|
for word, unit in tokens.items():
|
||||||
|
lemma = unit.token
|
||||||
|
if lemma in lemma_to_word:
|
||||||
|
lemma_to_word[lemma].append(word)
|
||||||
|
else:
|
||||||
|
lemma_to_word[lemma] = [word]
|
||||||
|
return lemma_to_word
|
||||||
|
|
||||||
|
|
||||||
|
def _get_keywords_with_score(extracted_lemmas, lemma_to_word):
|
||||||
|
"""
|
||||||
|
:param extracted_lemmas:list of tuples
|
||||||
|
:param lemma_to_word: dict of {lemma:list of words}
|
||||||
|
:return: dict of {keyword:score}
|
||||||
|
"""
|
||||||
|
keywords = {}
|
||||||
|
for score, lemma in extracted_lemmas:
|
||||||
|
keyword_list = lemma_to_word[lemma]
|
||||||
|
for keyword in keyword_list:
|
||||||
|
keywords[keyword] = score
|
||||||
|
return keywords
|
||||||
|
|
||||||
|
|
||||||
|
def _strip_word(word):
|
||||||
|
stripped_word_list = list(_tokenize_by_word(word))
|
||||||
|
return stripped_word_list[0] if stripped_word_list else ""
|
||||||
|
|
||||||
|
|
||||||
|
def _get_combined_keywords(_keywords, split_text):
|
||||||
|
"""
|
||||||
|
:param keywords:dict of keywords:scores
|
||||||
|
:param split_text: list of strings
|
||||||
|
:return: combined_keywords:list
|
||||||
|
"""
|
||||||
|
result = []
|
||||||
|
_keywords = _keywords.copy()
|
||||||
|
len_text = len(split_text)
|
||||||
|
for i in range(len_text):
|
||||||
|
word = _strip_word(split_text[i])
|
||||||
|
if word in _keywords:
|
||||||
|
combined_word = [word]
|
||||||
|
if i + 1 == len_text:
|
||||||
|
result.append(word) # appends last word if keyword and doesn't iterate
|
||||||
|
for j in range(i + 1, len_text):
|
||||||
|
other_word = _strip_word(split_text[j])
|
||||||
|
if other_word in _keywords and other_word == split_text[j] \
|
||||||
|
and other_word not in combined_word:
|
||||||
|
combined_word.append(other_word)
|
||||||
|
else:
|
||||||
|
for keyword in combined_word:
|
||||||
|
_keywords.pop(keyword)
|
||||||
|
result.append(" ".join(combined_word))
|
||||||
|
break
|
||||||
|
return result
|
||||||
|
|
||||||
|
|
||||||
|
def _get_average_score(concept, _keywords):
|
||||||
|
word_list = concept.split()
|
||||||
|
word_counter = 0
|
||||||
|
total = 0
|
||||||
|
for word in word_list:
|
||||||
|
total += _keywords[word]
|
||||||
|
word_counter += 1
|
||||||
|
return total / word_counter
|
||||||
|
|
||||||
|
|
||||||
|
def _format_results(_keywords, combined_keywords, split, scores):
|
||||||
|
"""
|
||||||
|
:param keywords:dict of keywords:scores
|
||||||
|
:param combined_keywords:list of word/s
|
||||||
|
"""
|
||||||
|
combined_keywords.sort(key=lambda w: _get_average_score(w, _keywords), reverse=True)
|
||||||
|
if scores:
|
||||||
|
return [(word, _get_average_score(word, _keywords)) for word in combined_keywords]
|
||||||
|
if split:
|
||||||
|
return combined_keywords
|
||||||
|
return "\n".join(combined_keywords)
|
||||||
|
|
||||||
|
|
||||||
|
def keywords(text, ratio=0.2, words=None, language="english", split=False, scores=False, deaccent=False, additional_stopwords=None):
|
||||||
|
if not isinstance(text, str):
|
||||||
|
raise ValueError("Text parameter must be a Unicode object (str)!")
|
||||||
|
|
||||||
|
# Gets a dict of word -> lemma
|
||||||
|
tokens = _clean_text_by_word(text, language, deacc=deaccent, additional_stopwords=additional_stopwords)
|
||||||
|
split_text = list(_tokenize_by_word(text))
|
||||||
|
|
||||||
|
# Creates the graph and adds the edges
|
||||||
|
graph = _build_graph(_get_words_for_graph(tokens))
|
||||||
|
_set_graph_edges(graph, tokens, split_text)
|
||||||
|
del split_text # It's no longer used
|
||||||
|
|
||||||
|
_remove_unreachable_nodes(graph)
|
||||||
|
|
||||||
|
# PageRank cannot be run in an empty graph.
|
||||||
|
if len(graph.nodes()) == 0:
|
||||||
|
return [] if split else ""
|
||||||
|
|
||||||
|
# Ranks the tokens using the PageRank algorithm. Returns dict of lemma -> score
|
||||||
|
pagerank_scores = _pagerank(graph)
|
||||||
|
|
||||||
|
extracted_lemmas = _extract_tokens(graph.nodes(), pagerank_scores, ratio, words)
|
||||||
|
|
||||||
|
lemmas_to_word = _lemmas_to_words(tokens)
|
||||||
|
keywords = _get_keywords_with_score(extracted_lemmas, lemmas_to_word)
|
||||||
|
|
||||||
|
# text.split() to keep numbers and punctuation marks, so separeted concepts are not combined
|
||||||
|
combined_keywords = _get_combined_keywords(keywords, text.split())
|
||||||
|
|
||||||
|
return _format_results(keywords, combined_keywords, split, scores)
|
||||||
|
|
||||||
|
|
||||||
|
def get_graph(text, language="english", deaccent=False):
|
||||||
|
tokens = _clean_text_by_word(text, language, deacc=deaccent)
|
||||||
|
split_text = list(_tokenize_by_word(text, deacc=deaccent))
|
||||||
|
|
||||||
|
graph = _build_graph(_get_words_for_graph(tokens))
|
||||||
|
_set_graph_edges(graph, tokens, split_text)
|
||||||
|
|
||||||
|
return graph
|
@@ -0,0 +1,86 @@
from scipy.sparse import csr_matrix
from scipy.linalg import eig
from numpy import empty as empty_matrix

CONVERGENCE_THRESHOLD = 0.0001


def pagerank_weighted(graph, initial_value=None, damping=0.85):
    """Calculates PageRank for an undirected graph"""
    if initial_value == None: initial_value = 1.0 / len(graph.nodes())
    scores = dict.fromkeys(graph.nodes(), initial_value)

    iteration_quantity = 0
    for iteration_number in range(100):
        iteration_quantity += 1
        convergence_achieved = 0
        for i in graph.nodes():
            rank = 1 - damping
            for j in graph.neighbors(i):
                neighbors_sum = sum(graph.edge_weight((j, k)) for k in graph.neighbors(j))
                rank += damping * scores[j] * graph.edge_weight((j, i)) / neighbors_sum

            if abs(scores[i] - rank) <= CONVERGENCE_THRESHOLD:
                convergence_achieved += 1

            scores[i] = rank

        if convergence_achieved == len(graph.nodes()):
            break

    return scores


def pagerank_weighted_scipy(graph, damping=0.85):
    adjacency_matrix = build_adjacency_matrix(graph)
    probability_matrix = build_probability_matrix(graph)

    # Suppress deprecation warnings from numpy.
    # See https://github.com/summanlp/textrank/issues/57
    import warnings
    with warnings.catch_warnings():
        from numpy import VisibleDeprecationWarning
        warnings.filterwarnings("ignore", category=VisibleDeprecationWarning)
        warnings.filterwarnings("ignore", category=PendingDeprecationWarning)
        pagerank_matrix = damping * adjacency_matrix.todense() + (1 - damping) * probability_matrix

    vals, vecs = eig(pagerank_matrix, left=True, right=False)
    return process_results(graph, vecs)


def build_adjacency_matrix(graph):
    row = []
    col = []
    data = []
    nodes = graph.nodes()
    length = len(nodes)

    for i in range(length):
        current_node = nodes[i]
        neighbors_sum = sum(graph.edge_weight((current_node, neighbor)) for neighbor in graph.neighbors(current_node))
        for j in range(length):
            edge_weight = float(graph.edge_weight((current_node, nodes[j])))
            if i != j and edge_weight != 0:
                row.append(i)
                col.append(j)
                data.append(edge_weight / neighbors_sum)

    return csr_matrix((data,(row,col)), shape=(length,length))


def build_probability_matrix(graph):
    dimension = len(graph.nodes())
    matrix = empty_matrix((dimension,dimension))

    probability = 1 / float(dimension)
    matrix.fill(probability)

    return matrix


def process_results(graph, vecs):
    scores = {}
    for i, node in enumerate(graph.nodes()):
        scores[node] = abs(vecs[i][0])

    return scores
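
# Illustration (toy graph): the loop above implements the weighted PageRank
# update  score(i) = (1 - d) + d * sum_j score(j) * w(j, i) / sum_k w(j, k)
# over a node's neighbours j. A three-word example using the pure-Python variant:
from summa.commons import build_graph
from summa.pagerank_weighted import pagerank_weighted

toy = build_graph(["elder", "flower", "cordial"])
toy.add_edge(("elder", "flower"), wt=2)
toy.add_edge(("flower", "cordial"), wt=1)
scores = pagerank_weighted(toy)
print(sorted(scores, key=scores.get, reverse=True))
# ['flower', 'elder', 'cordial'] -- "flower" links to both other words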
Binary files not shown.
@ -0,0 +1,635 @@
|
|||||||
|
# Adapted from the NLTK package v3.0.1:
|
||||||
|
# https://github.com/nltk/nltk/blob/3.0.1/nltk/stem/porter.py
|
||||||
|
|
||||||
|
# Copyright (c) 2002 Vivake Gupta (vivakeATomniscia.org). All rights reserved.
|
||||||
|
#
|
||||||
|
# This program is free software; you can redistribute it and/or
|
||||||
|
# modify it under the terms of the GNU General Public License as
|
||||||
|
# published by the Free Software Foundation; either version 2 of the
|
||||||
|
# License, or (at your option) any later version.
|
||||||
|
#
|
||||||
|
# This program is distributed in the hope that it will be useful,
|
||||||
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
# GNU General Public License for more details.
|
||||||
|
#
|
||||||
|
# You should have received a copy of the GNU General Public License
|
||||||
|
# along with this program; if not, write to the Free Software
|
||||||
|
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
|
||||||
|
# USA
|
||||||
|
#
|
||||||
|
# This software is maintained by Vivake (vivakeATomniscia.org) and is available at:
|
||||||
|
# http://www.omniscia.org/~vivake/python/PorterStemmer.py
|
||||||
|
#
|
||||||
|
# Additional modifications were made to incorporate this module into
|
||||||
|
# NLTK. All such modifications are marked with "--NLTK--". The NLTK
|
||||||
|
# version of this module is maintained by NLTK developers,
|
||||||
|
# and is available via http://nltk.org/
|
||||||
|
#
|
||||||
|
# GNU Linking Exception:
|
||||||
|
# Using this module statically or dynamically with other modules is
|
||||||
|
# making a combined work based on this module. Thus, the terms and
|
||||||
|
# conditions of the GNU General Public License cover the whole combination.
|
||||||
|
# As a special exception, the copyright holders of this module give
|
||||||
|
# you permission to combine this module with independent modules to
|
||||||
|
# produce an executable program, regardless of the license terms of these
|
||||||
|
# independent modules, and to copy and distribute the resulting
|
||||||
|
# program under terms of your choice, provided that you also meet,
|
||||||
|
# for each linked independent module, the terms and conditions of
|
||||||
|
# the license of that module. An independent module is a module which
|
||||||
|
# is not derived from or based on this module. If you modify this module,
|
||||||
|
# you may extend this exception to your version of the module, but you
|
||||||
|
# are not obliged to do so. If you do not wish to do so, delete this
|
||||||
|
# exception statement from your version.
|
||||||
|
|
||||||
|
"""
|
||||||
|
Porter Stemmer
|
||||||
|
|
||||||
|
This is the Porter stemming algorithm, ported to Python from the
|
||||||
|
version coded up in ANSI C by the author. It follows the algorithm
|
||||||
|
presented in
|
||||||
|
|
||||||
|
Porter, M. "An algorithm for suffix stripping." Program 14.3 (1980): 130-137.
|
||||||
|
|
||||||
|
only differing from it at the points marked --DEPARTURE-- and --NEW--
|
||||||
|
below.
|
||||||
|
|
||||||
|
For a more faithful version of the Porter algorithm, see
|
||||||
|
|
||||||
|
http://www.tartarus.org/~martin/PorterStemmer/
|
||||||
|
|
||||||
|
Later additions:
|
||||||
|
|
||||||
|
June 2000
|
||||||
|
|
||||||
|
The 'l' of the 'logi' -> 'log' rule is put with the stem, so that
|
||||||
|
short stems like 'geo' 'theo' etc work like 'archaeo' 'philo' etc.
|
||||||
|
|
||||||
|
This follows a suggestion of Barry Wilkins, research student at
|
||||||
|
Birmingham.
|
||||||
|
|
||||||
|
|
||||||
|
February 2000
|
||||||
|
|
||||||
|
the cvc test for not dropping final -e now looks after vc at the
|
||||||
|
beginning of a word, so are, eve, ice, ore, use keep final -e. In this
|
||||||
|
test c is any consonant, including w, x and y. This extension was
|
||||||
|
suggested by Chris Emerson.
|
||||||
|
|
||||||
|
-fully -> -ful treated like -fulness -> -ful, and
|
||||||
|
-tionally -> -tion treated like -tional -> -tion
|
||||||
|
|
||||||
|
both in Step 2. These were suggested by Hiranmay Ghosh, of New Delhi.
|
||||||
|
|
||||||
|
Invariants proceed, succeed, exceed. Also suggested by Hiranmay Ghosh.
|
||||||
|
|
||||||
|
Additional modifications were made to incorperate this module into
|
||||||
|
nltk. All such modifications are marked with \"--NLTK--\". The nltk
|
||||||
|
version of this module is maintained by the NLTK developers, and is
|
||||||
|
available from <http://nltk.sourceforge.net>
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
## --NLTK--
|
||||||
|
## Declare this module's documentation format.
|
||||||
|
|
||||||
|
class PorterStemmer():
|
||||||
|
|
||||||
|
## --NLTK--
|
||||||
|
## Add a module docstring
|
||||||
|
"""
|
||||||
|
A word stemmer based on the Porter stemming algorithm.
|
||||||
|
|
||||||
|
Porter, M. \"An algorithm for suffix stripping.\"
|
||||||
|
Program 14.3 (1980): 130-137.
|
||||||
|
|
||||||
|
A few minor modifications have been made to Porter's basic
|
||||||
|
algorithm. See the source code of this module for more
|
||||||
|
information.
|
||||||
|
|
||||||
|
The Porter Stemmer requires that all tokens have string types.
|
||||||
|
"""
|
||||||
|
|
||||||
|
# The main part of the stemming algorithm starts here.
|
||||||
|
# Note that only lower case sequences are stemmed. Forcing to lower case
|
||||||
|
# should be done before stem(...) is called.
|
||||||
|
|
||||||
|
def __init__(self):
|
||||||
|
|
||||||
|
## --NEW--
|
||||||
|
## This is a table of irregular forms. It is quite short, but still
|
||||||
|
## reflects the errors actually drawn to Martin Porter's attention over
|
||||||
|
## a 20 year period!
|
||||||
|
##
|
||||||
|
## Extend it as necessary.
|
||||||
|
##
|
||||||
|
## The form of the table is:
|
||||||
|
## {
|
||||||
|
## "p1" : ["s11","s12","s13", ... ],
|
||||||
|
## "p2" : ["s21","s22","s23", ... ],
|
||||||
|
## ...
|
||||||
|
## "pn" : ["sn1","sn2","sn3", ... ]
|
||||||
|
## }
|
||||||
|
##
|
||||||
|
## String sij is mapped to paradigm form pi, and the main stemming
|
||||||
|
## process is then bypassed.
|
||||||
|
|
||||||
|
irregular_forms = {
|
||||||
|
"sky" : ["sky", "skies"],
|
||||||
|
"die" : ["dying"],
|
||||||
|
"lie" : ["lying"],
|
||||||
|
"tie" : ["tying"],
|
||||||
|
"news" : ["news"],
|
||||||
|
"inning" : ["innings", "inning"],
|
||||||
|
"outing" : ["outings", "outing"],
|
||||||
|
"canning" : ["cannings", "canning"],
|
||||||
|
"howe" : ["howe"],
|
||||||
|
|
||||||
|
# --NEW--
|
||||||
|
"proceed" : ["proceed"],
|
||||||
|
"exceed" : ["exceed"],
|
||||||
|
"succeed" : ["succeed"], # Hiranmay Ghosh
|
||||||
|
}
|
||||||
|
|
||||||
|
self.pool = {}
|
||||||
|
for key in irregular_forms:
|
||||||
|
for val in irregular_forms[key]:
|
||||||
|
self.pool[val] = key
|
||||||
|
|
||||||
|
self.vowels = frozenset(['a', 'e', 'i', 'o', 'u'])
|
||||||
|
|
||||||
|
def _cons(self, word, i):
|
||||||
|
"""cons(i) is TRUE <=> b[i] is a consonant."""
|
||||||
|
if word[i] in self.vowels:
|
||||||
|
return False
|
||||||
|
if word[i] == 'y':
|
||||||
|
if i == 0:
|
||||||
|
return True
|
||||||
|
else:
|
||||||
|
return (not self._cons(word, i - 1))
|
||||||
|
return True
|
||||||
|
|
||||||
|
def _m(self, word, j):
|
||||||
|
"""m() measures the number of consonant sequences between k0 and j.
|
||||||
|
if c is a consonant sequence and v a vowel sequence, and <..>
|
||||||
|
indicates arbitrary presence,
|
||||||
|
|
||||||
|
<c><v> gives 0
|
||||||
|
<c>vc<v> gives 1
|
||||||
|
<c>vcvc<v> gives 2
|
||||||
|
<c>vcvcvc<v> gives 3
|
||||||
|
....
|
||||||
|
"""
|
||||||
|
n = 0
|
||||||
|
i = 0
|
||||||
|
while True:
|
||||||
|
if i > j:
|
||||||
|
return n
|
||||||
|
if not self._cons(word, i):
|
||||||
|
break
|
||||||
|
i = i + 1
|
||||||
|
i = i + 1
|
||||||
|
|
||||||
|
while True:
|
||||||
|
while True:
|
||||||
|
if i > j:
|
||||||
|
return n
|
||||||
|
if self._cons(word, i):
|
||||||
|
break
|
||||||
|
i = i + 1
|
||||||
|
i = i + 1
|
||||||
|
n = n + 1
|
||||||
|
|
||||||
|
while True:
|
||||||
|
if i > j:
|
||||||
|
return n
|
||||||
|
if not self._cons(word, i):
|
||||||
|
break
|
||||||
|
i = i + 1
|
||||||
|
i = i + 1
|
||||||
|
|
||||||
|
def _vowelinstem(self, stem):
|
||||||
|
"""vowelinstem(stem) is TRUE <=> stem contains a vowel"""
|
||||||
|
for i in range(len(stem)):
|
||||||
|
if not self._cons(stem, i):
|
||||||
|
return True
|
||||||
|
return False
|
||||||
|
|
||||||
|
def _doublec(self, word):
|
||||||
|
"""doublec(word) is TRUE <=> word ends with a double consonant"""
|
||||||
|
if len(word) < 2:
|
||||||
|
return False
|
||||||
|
if (word[-1] != word[-2]):
|
||||||
|
return False
|
||||||
|
return self._cons(word, len(word)-1)
|
||||||
|
|
||||||
|
def _cvc(self, word, i):
|
||||||
|
"""cvc(i) is TRUE <=>
|
||||||
|
|
||||||
|
a) ( --NEW--) i == 1, and word[0] word[1] is vowel consonant, or
|
||||||
|
|
||||||
|
b) word[i - 2], word[i - 1], word[i] has the form consonant -
|
||||||
|
vowel - consonant and also if the second c is not w, x or y. this
|
||||||
|
is used when trying to restore an e at the end of a short word.
|
||||||
|
e.g.
|
||||||
|
|
||||||
|
cav(e), lov(e), hop(e), crim(e), but
|
||||||
|
snow, box, tray.
|
||||||
|
"""
|
||||||
|
if i == 0: return False # i == 0 never happens perhaps
|
||||||
|
if i == 1: return (not self._cons(word, 0) and self._cons(word, 1))
|
||||||
|
if not self._cons(word, i) or self._cons(word, i-1) or not self._cons(word, i-2): return False
|
||||||
|
|
||||||
|
ch = word[i]
|
||||||
|
if ch == 'w' or ch == 'x' or ch == 'y':
|
||||||
|
return False
|
||||||
|
|
||||||
|
return True
|
||||||
|
|
||||||
|
def _step1ab(self, word):
|
||||||
|
"""step1ab() gets rid of plurals and -ed or -ing. e.g.
|
||||||
|
|
||||||
|
caresses -> caress
|
||||||
|
ponies -> poni
|
||||||
|
sties -> sti
|
||||||
|
tie -> tie (--NEW--: see below)
|
||||||
|
caress -> caress
|
||||||
|
cats -> cat
|
||||||
|
|
||||||
|
feed -> feed
|
||||||
|
agreed -> agree
|
||||||
|
disabled -> disable
|
||||||
|
|
||||||
|
matting -> mat
|
||||||
|
mating -> mate
|
||||||
|
meeting -> meet
|
||||||
|
milling -> mill
|
||||||
|
messing -> mess
|
||||||
|
|
||||||
|
meetings -> meet
|
||||||
|
"""
|
||||||
|
if word[-1] == 's':
|
||||||
|
if word.endswith("sses"):
|
||||||
|
word = word[:-2]
|
||||||
|
elif word.endswith("ies"):
|
||||||
|
if len(word) == 4:
|
||||||
|
word = word[:-1]
|
||||||
|
# this line extends the original algorithm, so that
|
||||||
|
# 'flies'->'fli' but 'dies'->'die' etc
|
||||||
|
else:
|
||||||
|
word = word[:-2]
|
||||||
|
elif word[-2] != 's':
|
||||||
|
word = word[:-1]
|
||||||
|
|
||||||
|
ed_or_ing_trimmed = False
|
||||||
|
if word.endswith("ied"):
|
||||||
|
if len(word) == 4:
|
||||||
|
word = word[:-1]
|
||||||
|
else:
|
||||||
|
word = word[:-2]
|
||||||
|
# this line extends the original algorithm, so that
|
||||||
|
# 'spied'->'spi' but 'died'->'die' etc
|
||||||
|
|
||||||
|
elif word.endswith("eed"):
|
||||||
|
if self._m(word, len(word)-4) > 0:
|
||||||
|
word = word[:-1]
|
||||||
|
|
||||||
|
|
||||||
|
elif word.endswith("ed") and self._vowelinstem(word[:-2]):
|
||||||
|
word = word[:-2]
|
||||||
|
ed_or_ing_trimmed = True
|
||||||
|
elif word.endswith("ing") and self._vowelinstem(word[:-3]):
|
||||||
|
word = word[:-3]
|
||||||
|
ed_or_ing_trimmed = True
|
||||||
|
|
||||||
|
if ed_or_ing_trimmed:
|
||||||
|
if word.endswith("at") or word.endswith("bl") or word.endswith("iz"):
|
||||||
|
word += 'e'
|
||||||
|
elif self._doublec(word):
|
||||||
|
if word[-1] not in ['l', 's', 'z']:
|
||||||
|
word = word[:-1]
|
||||||
|
elif (self._m(word, len(word)-1) == 1 and self._cvc(word, len(word)-1)):
|
||||||
|
word += 'e'
|
||||||
|
|
||||||
|
return word
|
||||||
|
|
||||||
|
def _step1c(self, word):
|
||||||
|
"""step1c() turns terminal y to i when there is another vowel in the stem.
|
||||||
|
--NEW--: This has been modified from the original Porter algorithm so that y->i
|
||||||
|
is only done when y is preceded by a consonant, but not if the stem
|
||||||
|
is only a single consonant, i.e.
|
||||||
|
|
||||||
|
(*c and not c) Y -> I
|
||||||
|
|
||||||
|
So 'happy' -> 'happi', but
|
||||||
|
'enjoy' -> 'enjoy' etc
|
||||||
|
|
||||||
|
This is a much better rule. Formerly 'enjoy'->'enjoi' and 'enjoyment'->
|
||||||
|
'enjoy'. Step 1c is perhaps done too soon; but with this modification that
|
||||||
|
no longer really matters.
|
||||||
|
|
||||||
|
Also, the removal of the vowelinstem(z) condition means that 'spy', 'fly',
|
||||||
|
'try' ... stem to 'spi', 'fli', 'tri' and conflate with 'spied', 'tried',
|
||||||
|
'flies' ...
|
||||||
|
"""
|
||||||
|
if word[-1] == 'y' and len(word) > 2 and self._cons(word, len(word) - 2):
|
||||||
|
return word[:-1] + 'i'
|
||||||
|
else:
|
||||||
|
return word
|
||||||
|
|
||||||
|
def _step2(self, word):
|
||||||
|
"""step2() maps double suffices to single ones.
|
||||||
|
so -ization ( = -ize plus -ation) maps to -ize etc. note that the
|
||||||
|
string before the suffix must give m() > 0.
|
||||||
|
"""
|
||||||
|
if len(word) <= 1: # Only possible at this stage given unusual inputs to stem_word like 'oed'
|
||||||
|
return word
|
||||||
|
|
||||||
|
ch = word[-2]
|
||||||
|
|
||||||
|
if ch == 'a':
|
||||||
|
if word.endswith("ational"):
|
||||||
|
return word[:-7] + "ate" if self._m(word, len(word)-8) > 0 else word
|
||||||
|
elif word.endswith("tional"):
|
||||||
|
return word[:-2] if self._m(word, len(word)-7) > 0 else word
|
||||||
|
else:
|
||||||
|
return word
|
||||||
|
elif ch == 'c':
|
||||||
|
if word.endswith("enci"):
|
||||||
|
return word[:-4] + "ence" if self._m(word, len(word)-5) > 0 else word
|
||||||
|
elif word.endswith("anci"):
|
||||||
|
return word[:-4] + "ance" if self._m(word, len(word)-5) > 0 else word
|
||||||
|
else:
|
||||||
|
return word
|
||||||
|
elif ch == 'e':
|
||||||
|
if word.endswith("izer"):
|
||||||
|
return word[:-1] if self._m(word, len(word)-5) > 0 else word
|
||||||
|
else:
|
||||||
|
return word
|
||||||
|
elif ch == 'l':
|
||||||
|
if word.endswith("bli"):
|
||||||
|
return word[:-3] + "ble" if self._m(word, len(word)-4) > 0 else word # --DEPARTURE--
|
||||||
|
# To match the published algorithm, replace "bli" with "abli" and "ble" with "able"
|
||||||
|
elif word.endswith("alli"):
|
||||||
|
# --NEW--
|
||||||
|
if self._m(word, len(word)-5) > 0:
|
||||||
|
word = word[:-2]
|
||||||
|
return self._step2(word)
|
||||||
|
else:
|
||||||
|
return word
|
||||||
|
elif word.endswith("fulli"):
|
||||||
|
return word[:-2] if self._m(word, len(word)-6) else word # --NEW--
|
||||||
|
elif word.endswith("entli"):
|
||||||
|
return word[:-2] if self._m(word, len(word)-6) else word
|
||||||
|
elif word.endswith("eli"):
|
||||||
|
return word[:-2] if self._m(word, len(word)-4) else word
|
||||||
|
elif word.endswith("ousli"):
|
||||||
|
return word[:-2] if self._m(word, len(word)-6) else word
|
||||||
|
else:
|
||||||
|
return word
|
||||||
|
elif ch == 'o':
|
||||||
|
if word.endswith("ization"):
|
||||||
|
return word[:-7] + "ize" if self._m(word, len(word)-8) else word
|
||||||
|
elif word.endswith("ation"):
|
||||||
|
return word[:-5] + "ate" if self._m(word, len(word)-6) else word
|
||||||
|
elif word.endswith("ator"):
|
||||||
|
return word[:-4] + "ate" if self._m(word, len(word)-5) else word
|
||||||
|
else:
|
||||||
|
return word
|
||||||
|
elif ch == 's':
|
||||||
|
if word.endswith("alism"):
|
||||||
|
return word[:-3] if self._m(word, len(word)-6) else word
|
||||||
|
elif word.endswith("ness"):
|
||||||
|
if word.endswith("iveness"):
|
||||||
|
return word[:-4] if self._m(word, len(word)-8) else word
|
||||||
|
elif word.endswith("fulness"):
|
||||||
|
return word[:-4] if self._m(word, len(word)-8) else word
|
||||||
|
elif word.endswith("ousness"):
|
||||||
|
return word[:-4] if self._m(word, len(word)-8) else word
|
||||||
|
else:
|
||||||
|
return word
|
||||||
|
else:
|
||||||
|
return word
|
||||||
|
elif ch == 't':
|
||||||
|
if word.endswith("aliti"):
|
||||||
|
return word[:-3] if self._m(word, len(word)-6) else word
|
||||||
|
elif word.endswith("iviti"):
|
||||||
|
return word[:-5] + "ive" if self._m(word, len(word)-6) else word
|
||||||
|
elif word.endswith("biliti"):
|
||||||
|
return word[:-6] + "ble" if self._m(word, len(word)-7) else word
|
||||||
|
else:
|
||||||
|
return word
|
||||||
|
elif ch == 'g': # --DEPARTURE--
|
||||||
|
if word.endswith("logi"):
|
||||||
|
return word[:-1] if self._m(word, len(word) - 4) else word # --NEW-- (Barry Wilkins)
|
||||||
|
# To match the published algorithm, pass len(word)-5 to _m instead of len(word)-4
|
||||||
|
else:
|
||||||
|
return word
|
||||||
|
|
||||||
|
else:
|
||||||
|
return word
|
||||||
|
|
||||||
|
def _step3(self, word):
|
||||||
|
"""step3() deals with -ic-, -full, -ness etc. similar strategy to step2."""
|
||||||
|
|
||||||
|
ch = word[-1]
|
||||||
|
|
||||||
|
if ch == 'e':
|
||||||
|
if word.endswith("icate"):
|
||||||
|
return word[:-3] if self._m(word, len(word)-6) else word
|
||||||
|
elif word.endswith("ative"):
|
||||||
|
return word[:-5] if self._m(word, len(word)-6) else word
|
||||||
|
elif word.endswith("alize"):
|
||||||
|
return word[:-3] if self._m(word, len(word)-6) else word
|
||||||
|
else:
|
||||||
|
return word
|
||||||
|
elif ch == 'i':
|
||||||
|
if word.endswith("iciti"):
|
||||||
|
return word[:-3] if self._m(word, len(word)-6) else word
|
||||||
|
else:
|
||||||
|
return word
|
||||||
|
elif ch == 'l':
|
||||||
|
if word.endswith("ical"):
|
||||||
|
return word[:-2] if self._m(word, len(word)-5) else word
|
||||||
|
elif word.endswith("ful"):
|
||||||
|
return word[:-3] if self._m(word, len(word)-4) else word
|
||||||
|
else:
|
||||||
|
return word
|
||||||
|
elif ch == 's':
|
||||||
|
if word.endswith("ness"):
|
||||||
|
return word[:-4] if self._m(word, len(word)-5) else word
|
||||||
|
else:
|
||||||
|
return word
|
||||||
|
|
||||||
|
else:
|
||||||
|
return word
|
||||||
|
|
||||||
|
def _step4(self, word):
|
||||||
|
"""step4() takes off -ant, -ence etc., in context <c>vcvc<v>."""
|
||||||
|
|
||||||
|
if len(word) <= 1: # Only possible at this stage given unusual inputs to stem_word like 'oed'
|
||||||
|
return word
|
||||||
|
|
||||||
|
ch = word[-2]
|
||||||
|
|
||||||
|
if ch == 'a':
|
||||||
|
if word.endswith("al"):
|
||||||
|
return word[:-2] if self._m(word, len(word)-3) > 1 else word
|
||||||
|
else:
|
||||||
|
return word
|
||||||
|
elif ch == 'c':
|
||||||
|
if word.endswith("ance"):
|
||||||
|
return word[:-4] if self._m(word, len(word)-5) > 1 else word
|
||||||
|
elif word.endswith("ence"):
|
||||||
|
return word[:-4] if self._m(word, len(word)-5) > 1 else word
|
||||||
|
else:
|
||||||
|
return word
|
||||||
|
elif ch == 'e':
|
||||||
|
if word.endswith("er"):
|
||||||
|
return word[:-2] if self._m(word, len(word)-3) > 1 else word
|
||||||
|
else:
|
||||||
|
return word
|
||||||
|
elif ch == 'i':
|
||||||
|
if word.endswith("ic"):
|
||||||
|
return word[:-2] if self._m(word, len(word)-3) > 1 else word
|
||||||
|
else:
|
||||||
|
return word
|
||||||
|
elif ch == 'l':
|
||||||
|
if word.endswith("able"):
|
||||||
|
return word[:-4] if self._m(word, len(word)-5) > 1 else word
|
||||||
|
elif word.endswith("ible"):
|
||||||
|
return word[:-4] if self._m(word, len(word)-5) > 1 else word
|
||||||
|
else:
|
||||||
|
return word
|
||||||
|
elif ch == 'n':
|
||||||
|
if word.endswith("ant"):
|
||||||
|
return word[:-3] if self._m(word, len(word)-4) > 1 else word
|
||||||
|
elif word.endswith("ement"):
|
||||||
|
return word[:-5] if self._m(word, len(word)-6) > 1 else word
|
||||||
|
elif word.endswith("ment"):
|
||||||
|
return word[:-4] if self._m(word, len(word)-5) > 1 else word
|
||||||
|
elif word.endswith("ent"):
|
||||||
|
return word[:-3] if self._m(word, len(word)-4) > 1 else word
|
||||||
|
else:
|
||||||
|
return word
|
||||||
|
elif ch == 'o':
|
||||||
|
if word.endswith("sion") or word.endswith("tion"): # slightly different logic to all the other cases
|
||||||
|
return word[:-3] if self._m(word, len(word)-4) > 1 else word
|
||||||
|
elif word.endswith("ou"):
|
||||||
|
return word[:-2] if self._m(word, len(word)-3) > 1 else word
|
||||||
|
else:
|
||||||
|
return word
|
||||||
|
elif ch == 's':
|
||||||
|
if word.endswith("ism"):
|
||||||
|
return word[:-3] if self._m(word, len(word)-4) > 1 else word
|
||||||
|
else:
|
||||||
|
return word
|
||||||
|
elif ch == 't':
|
||||||
|
if word.endswith("ate"):
|
||||||
|
return word[:-3] if self._m(word, len(word)-4) > 1 else word
|
||||||
|
elif word.endswith("iti"):
|
||||||
|
return word[:-3] if self._m(word, len(word)-4) > 1 else word
|
||||||
|
else:
|
||||||
|
return word
|
||||||
|
elif ch == 'u':
|
||||||
|
if word.endswith("ous"):
|
||||||
|
return word[:-3] if self._m(word, len(word)-4) > 1 else word
|
||||||
|
else:
|
||||||
|
return word
|
||||||
|
elif ch == 'v':
|
||||||
|
if word.endswith("ive"):
|
||||||
|
return word[:-3] if self._m(word, len(word)-4) > 1 else word
|
||||||
|
else:
|
||||||
|
return word
|
||||||
|
elif ch == 'z':
|
||||||
|
if word.endswith("ize"):
|
||||||
|
return word[:-3] if self._m(word, len(word)-4) > 1 else word
|
||||||
|
else:
|
||||||
|
return word
|
||||||
|
else:
|
||||||
|
return word
|
||||||
|
|
||||||
|
def _step5(self, word):
|
||||||
|
"""step5() removes a final -e if m() > 1, and changes -ll to -l if
|
||||||
|
m() > 1.
|
||||||
|
"""
|
||||||
|
if word[-1] == 'e':
|
||||||
|
a = self._m(word, len(word)-1)
|
||||||
|
if a > 1 or (a == 1 and not self._cvc(word, len(word)-2)):
|
||||||
|
word = word[:-1]
|
||||||
|
if word.endswith('ll') and self._m(word, len(word)-1) > 1:
|
||||||
|
word = word[:-1]
|
||||||
|
|
||||||
|
return word
|
||||||
|
|
||||||
|
def stem_word(self, p, i=0, j=None):
|
||||||
|
"""
|
||||||
|
Returns the stem of p, or, if i and j are given, the stem of p[i:j+1].
|
||||||
|
"""
|
||||||
|
## --NLTK--
|
||||||
|
if j is None and i == 0:
|
||||||
|
word = p
|
||||||
|
else:
|
||||||
|
if j is None:
|
||||||
|
j = len(p) - 1
|
||||||
|
word = p[i:j+1]
|
||||||
|
|
||||||
|
if word in self.pool:
|
||||||
|
return self.pool[word]
|
||||||
|
|
||||||
|
if len(word) <= 2:
|
||||||
|
return word # --DEPARTURE--
|
||||||
|
# With this line, strings of length 1 or 2 don't go through the
|
||||||
|
# stemming process, although no mention is made of this in the
|
||||||
|
# published algorithm. Remove the line to match the published
|
||||||
|
# algorithm.
|
||||||
|
|
||||||
|
word = self._step1ab(word)
|
||||||
|
word = self._step1c(word)
|
||||||
|
word = self._step2(word)
|
||||||
|
word = self._step3(word)
|
||||||
|
word = self._step4(word)
|
||||||
|
word = self._step5(word)
|
||||||
|
return word
|
||||||
|
|
||||||
|
def _adjust_case(self, word, stem):
|
||||||
|
lower = word.lower()
|
||||||
|
|
||||||
|
ret = ""
|
||||||
|
for x in range(len(stem)):
|
||||||
|
if lower[x] == stem[x]:
|
||||||
|
ret += word[x]
|
||||||
|
else:
|
||||||
|
ret += stem[x]
|
||||||
|
|
||||||
|
return ret
|
||||||
|
|
||||||
|
## --NLTK--
|
||||||
|
## Don't use this procedure; we want to work with individual
|
||||||
|
## tokens, instead. (commented out the following procedure)
|
||||||
|
#def stem(self, text):
|
||||||
|
# parts = re.split("(\W+)", text)
|
||||||
|
# numWords = (len(parts) + 1)/2
|
||||||
|
#
|
||||||
|
# ret = ""
|
||||||
|
# for i in xrange(numWords):
|
||||||
|
# word = parts[2 * i]
|
||||||
|
# separator = ""
|
||||||
|
# if ((2 * i) + 1) < len(parts):
|
||||||
|
# separator = parts[(2 * i) + 1]
|
||||||
|
#
|
||||||
|
# stem = self.stem_word(string.lower(word), 0, len(word) - 1)
|
||||||
|
# ret = ret + self.adjust_case(word, stem)
|
||||||
|
# ret = ret + separator
|
||||||
|
# return ret
|
||||||
|
|
||||||
|
## --NLTK--
|
||||||
|
## Define a stem() method that implements the StemmerI interface.
|
||||||
|
def stem(self, word):
|
||||||
|
stem = self.stem_word(word.lower(), 0, len(word) - 1)
|
||||||
|
return self._adjust_case(word, stem)
|
||||||
|
|
||||||
|
## --NLTK--
|
||||||
|
## Add a string representation function
|
||||||
|
def __repr__(self):
|
||||||
|
return '<PorterStemmer>'
|
File diff suppressed because it is too large
@ -0,0 +1,188 @@
import string
import unicodedata
import logging

logger = logging.getLogger('summa.preprocessing.cleaner')

try:
    from pattern.en import tag
    logger.info("'pattern' package found; tag filters are available for English")
    HAS_PATTERN = True
except ImportError:
    logger.info("'pattern' package not found; tag filters are not available for English")
    HAS_PATTERN = False

import re

from .snowball import SnowballStemmer
from .stopwords import get_stopwords_by_language
from summa.syntactic_unit import SyntacticUnit


# Utility functions adapted from Gensim v0.10.0:
# https://github.com/RaRe-Technologies/gensim/blob/0.10.0/gensim/utils.py
# https://github.com/RaRe-Technologies/gensim/blob/0.10.0/gensim/parsing/preprocessing.py


SEPARATOR = r"@"
RE_SENTENCE = re.compile(r'(\S.+?[.!?])(?=\s+|$)|(\S.+?)(?=[\n]|$)')
AB_SENIOR = re.compile(r"([A-Z][a-z]{1,2}\.)\s(\w)")
AB_ACRONYM = re.compile(r"(\.[a-zA-Z]\.)\s(\w)")
AB_ACRONYM_LETTERS = re.compile(r"([a-zA-Z])\.([a-zA-Z])\.")
UNDO_AB_SENIOR = re.compile(r"([A-Z][a-z]{1,2}\.)" + SEPARATOR + r"(\w)")
UNDO_AB_ACRONYM = re.compile(r"(\.[a-zA-Z]\.)" + SEPARATOR + r"(\w)")

STEMMER = None
STOPWORDS = None


def set_stemmer_language(language):
    global STEMMER
    if language not in SnowballStemmer.languages:
        raise ValueError("Valid languages are: " + ", ".join(sorted(SnowballStemmer.languages)))
    STEMMER = SnowballStemmer(language)


def set_stopwords_by_language(language, additional_stopwords):
    global STOPWORDS
    words = get_stopwords_by_language(language)
    if not additional_stopwords:
        additional_stopwords = {}
    STOPWORDS = frozenset({w for w in words.split() if w} | {w for w in additional_stopwords if w})


def init_textcleanner(language, additional_stopwords):
    set_stemmer_language(language)
    set_stopwords_by_language(language, additional_stopwords)


def split_sentences(text):
    processed = replace_abbreviations(text)
    return [undo_replacement(sentence) for sentence in get_sentences(processed)]


def replace_abbreviations(text):
    return replace_with_separator(text, SEPARATOR, [AB_SENIOR, AB_ACRONYM])


def undo_replacement(sentence):
    return replace_with_separator(sentence, r" ", [UNDO_AB_SENIOR, UNDO_AB_ACRONYM])


def replace_with_separator(text, separator, regexs):
    replacement = r"\1" + separator + r"\2"
    result = text
    for regex in regexs:
        result = regex.sub(replacement, result)
    return result


def get_sentences(text):
    for match in RE_SENTENCE.finditer(text):
        yield match.group()


# Taken from Gensim
RE_PUNCT = re.compile('([%s])+' % re.escape(string.punctuation), re.UNICODE)
def strip_punctuation(s):
    return RE_PUNCT.sub(" ", s)


# Taken from Gensim
RE_NUMERIC = re.compile(r"[0-9]+", re.UNICODE)
def strip_numeric(s):
    return RE_NUMERIC.sub("", s)


def remove_stopwords(sentence):
    return " ".join(w for w in sentence.split() if w not in STOPWORDS)


def stem_sentence(sentence):
    word_stems = [STEMMER.stem(word) for word in sentence.split()]
    return " ".join(word_stems)


def apply_filters(sentence, filters):
    for f in filters:
        sentence = f(sentence)
    return sentence


def filter_words(sentences):
    filters = [lambda x: x.lower(), strip_numeric, strip_punctuation, remove_stopwords,
               stem_sentence]
    apply_filters_to_token = lambda token: apply_filters(token, filters)
    return list(map(apply_filters_to_token, sentences))


# Taken from Gensim
def deaccent(text):
    """
    Remove accentuation from the given string.
    """
    norm = unicodedata.normalize("NFD", text)
    result = "".join(ch for ch in norm if unicodedata.category(ch) != 'Mn')
    return unicodedata.normalize("NFC", result)


# Taken from Gensim
PAT_ALPHABETIC = re.compile(r'(((?![\d])\w)+)', re.UNICODE)
def tokenize(text, lowercase=False, deacc=False):
    """
    Iteratively yield tokens as unicode strings, optionally also lowercasing them
    and removing accent marks.
    """
    if lowercase:
        text = text.lower()
    if deacc:
        text = deaccent(text)
    for match in PAT_ALPHABETIC.finditer(text):
        yield match.group()


def merge_syntactic_units(original_units, filtered_units, tags=None):
    units = []
    for i in range(len(original_units)):
        if filtered_units[i] == '':
            continue

        text = original_units[i]
        token = filtered_units[i]
        tag = tags[i][1] if tags else None
        sentence = SyntacticUnit(text, token, tag)
        sentence.index = i

        units.append(sentence)

    return units


def clean_text_by_sentences(text, language="english", additional_stopwords=None):
    """ Tokenizes a given text into sentences, applying filters and lemmatizing them.
    Returns a SyntacticUnit list. """
    init_textcleanner(language, additional_stopwords)
    original_sentences = split_sentences(text)
    filtered_sentences = filter_words(original_sentences)

    return merge_syntactic_units(original_sentences, filtered_sentences)


def clean_text_by_word(text, language="english", deacc=False, additional_stopwords=None):
    """ Tokenizes a given text into words, applying filters and lemmatizing them.
    Returns a dict of word -> syntacticUnit. """
    init_textcleanner(language, additional_stopwords)
    text_without_acronyms = replace_with_separator(text, "", [AB_ACRONYM_LETTERS])
    original_words = list(tokenize(text_without_acronyms, lowercase=True, deacc=deacc))
    filtered_words = filter_words(original_words)
    if HAS_PATTERN:
        tags = tag(" ".join(original_words))  # tag needs the context of the words in the text
    else:
        tags = None
    units = merge_syntactic_units(original_words, filtered_words, tags)
    return {unit.text: unit for unit in units}


def tokenize_by_word(text, deacc=False):
    text_without_acronyms = replace_with_separator(text, "", [AB_ACRONYM_LETTERS])
    return tokenize(text_without_acronyms, lowercase=True, deacc=deacc)
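A minimal usage sketch of the cleaner above, assuming the module is importable as `summa.preprocessing.textcleaner` (as in the upstream package) and that the vendored `snowball`/`stopwords` modules are present; the input sentences are made up. `clean_text_by_sentences` returns `SyntacticUnit` objects whose `.token` holds the lowercased, stopword-free, stemmed form later used to build the graph.

```python
# Sketch only: assumes this module is importable as summa.preprocessing.textcleaner.
from summa.preprocessing.textcleaner import clean_text_by_sentences

units = clean_text_by_sentences("Dr. Smith stores goods in a warehouse. The warehouse is large.")
for u in units:
    # u.text is the original sentence, u.token its filtered/stemmed version
    print(u.index, u.text, "->", u.token)
```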
@ -0,0 +1,24 @@
# Natural Language Toolkit: Stemmer Utilities
#
# Copyright (C) 2001-2019 NLTK Project
# Author: Helder <he7d3r@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT


def suffix_replace(original, old, new):
    """
    Replaces the old suffix of the original string by a new suffix
    """
    return original[: -len(old)] + new


def prefix_replace(original, old, new):
    """
    Replaces the old prefix of the original string by a new prefix
    :param original: string
    :param old: string
    :param new: string
    :return: string
    """
    return new + original[len(old):]
@ -0,0 +1,154 @@
from math import log10

from .pagerank_weighted import pagerank_weighted_scipy as _pagerank
from .preprocessing.textcleaner import clean_text_by_sentences as _clean_text_by_sentences
from .commons import build_graph as _build_graph
from .commons import remove_unreachable_nodes as _remove_unreachable_nodes


def _set_graph_edge_weights(graph):
    for sentence_1 in graph.nodes():
        for sentence_2 in graph.nodes():

            edge = (sentence_1, sentence_2)
            if sentence_1 != sentence_2 and not graph.has_edge(edge):
                similarity = _get_similarity(sentence_1, sentence_2)
                if similarity != 0:
                    graph.add_edge(edge, similarity)

    # Handles the case in which all similarities are zero.
    # The resultant summary will consist of random sentences.
    if all(graph.edge_weight(edge) == 0 for edge in graph.edges()):
        _create_valid_graph(graph)


def _create_valid_graph(graph):
    nodes = graph.nodes()

    for i in range(len(nodes)):
        for j in range(len(nodes)):
            if i == j:
                continue

            edge = (nodes[i], nodes[j])

            if graph.has_edge(edge):
                graph.del_edge(edge)

            graph.add_edge(edge, 1)


def _get_similarity(s1, s2):
    words_sentence_one = s1.split()
    words_sentence_two = s2.split()

    common_word_count = _count_common_words(words_sentence_one, words_sentence_two)

    log_s1 = log10(len(words_sentence_one))
    log_s2 = log10(len(words_sentence_two))

    if log_s1 + log_s2 == 0:
        return 0

    return common_word_count / (log_s1 + log_s2)


def _count_common_words(words_sentence_one, words_sentence_two):
    return len(set(words_sentence_one) & set(words_sentence_two))


def _format_results(extracted_sentences, split, score):
    if score:
        return [(sentence.text, sentence.score) for sentence in extracted_sentences]
    if split:
        return [sentence.text for sentence in extracted_sentences]
    return "\n".join([sentence.text for sentence in extracted_sentences])


def _add_scores_to_sentences(sentences, scores):
    for sentence in sentences:
        # Adds the score to the object if it has one.
        if sentence.token in scores:
            sentence.score = scores[sentence.token]
        else:
            sentence.score = 0


def _get_sentences_with_word_count(sentences, words):
    """ Given a list of sentences, returns a list of sentences with a
    total word count similar to the word count provided.
    """
    word_count = 0
    selected_sentences = []
    # Loops until the word count is reached.
    for sentence in sentences:
        words_in_sentence = len(sentence.text.split())

        # Checks if the inclusion of the sentence gives a better approximation
        # to the word parameter.
        if abs(words - word_count - words_in_sentence) > abs(words - word_count):
            return selected_sentences

        selected_sentences.append(sentence)
        word_count += words_in_sentence

    return selected_sentences


def _extract_most_important_sentences(sentences, ratio, words):
    sentences.sort(key=lambda s: s.score, reverse=True)

    # If no "words" option is selected, the number of sentences is
    # reduced by the provided ratio.
    if words is None:
        length = len(sentences) * ratio
        return sentences[:int(length)]

    # Else, the ratio is ignored.
    else:
        return _get_sentences_with_word_count(sentences, words)


def summarize(text, ratio=0.2, words=None, language="english", split=False, scores=False, additional_stopwords=None):
    if not isinstance(text, str):
        raise ValueError("Text parameter must be a Unicode object (str)!")

    # Gets a list of processed sentences.
    sentences = _clean_text_by_sentences(text, language, additional_stopwords)

    # Creates the graph and calculates the similarity coefficient for every pair of nodes.
    graph = _build_graph([sentence.token for sentence in sentences])
    _set_graph_edge_weights(graph)

    # Remove all nodes with all edges weights equal to zero.
    _remove_unreachable_nodes(graph)

    # PageRank cannot be run in an empty graph.
    if len(graph.nodes()) == 0:
        return [] if split else ""

    # Ranks the tokens using the PageRank algorithm. Returns dict of sentence -> score
    pagerank_scores = _pagerank(graph)

    # Adds the summa scores to the sentence objects.
    _add_scores_to_sentences(sentences, pagerank_scores)

    # EDIT: return the whole sentences with scores
    return sentences

    # Extracts the most important sentences with the selected criterion.
    # extracted_sentences = _extract_most_important_sentences(sentences, ratio, words)

    # Sorts the extracted sentences by apparition order in the original text.
    # extracted_sentences.sort(key=lambda s: s.index)

    # return _format_results(extracted_sentences, split, scores)


def get_graph(text, language="english"):
    sentences = _clean_text_by_sentences(text, language)

    graph = _build_graph([sentence.token for sentence in sentences])
    _set_graph_edge_weights(graph)

    return graph
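Because of the `EDIT` above, `summarize()` no longer trims the output to a ratio or word count; it hands back every sentence together with its PageRank score. A quick sketch of reading those scores (the file name is only a placeholder):

```python
# Sketch: summarize() as modified here returns a list of SyntacticUnit objects,
# each carrying .text, .token, .index and .score. "some_text.txt" is a placeholder.
from summa.summarizer import summarize

with open("some_text.txt") as f:
    text = f.read()

for sentence in summarize(text):
    print(f"{sentence.score:.4f}  {sentence.text}")
```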
@ -0,0 +1,14 @@
class SyntacticUnit(object):

    def __init__(self, text, token=None, tag=None):
        self.text = text
        self.token = token
        self.tag = tag[:2] if tag else None  # just first two letters of tag
        self.index = -1
        self.score = -1

    def __str__(self):
        return "Original unit: '" + self.text + "' *-*-*-* " + "Processed unit: '" + self.token + "'"

    def __repr__(self):
        return str(self)
@ -0,0 +1,97 @@
import argparse
import os
import sys
import warnings

from .summarizer import summarize
from .keywords import keywords

# Types of summarization
SENTENCE = 0
WORD = 1

DEFAULT_RATIO = 0.2


def textrank(text, summarize_by=SENTENCE, ratio=DEFAULT_RATIO, words=None, additional_stopwords=None):
    if summarize_by == SENTENCE:
        return summarize(text, ratio, words, additional_stopwords=additional_stopwords)
    else:
        return keywords(text, ratio, words, additional_stopwords=additional_stopwords)


def existing_file(file_name):
    try:
        with open(file_name, 'r') as file:
            return file.read()
    except Exception:
        raise argparse.ArgumentTypeError("The file provided could not be opened.")


def restricted_float(x):
    x = float(x)
    if x < 0.0 or x > 1.0:
        raise argparse.ArgumentTypeError("{} not in range [0.0, 1.0]".format(x))
    return x


def parse_args(args):
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter, prog="textrank", description="Extract the most relevant sentences or keywords of a given text using the TextRank algorithm.")

    group = parser.add_mutually_exclusive_group(required=True)
    # New API
    group.add_argument('--summarize', metavar="path/to/file", type=existing_file,
                       help="Run textrank to summarize the input text.")
    group.add_argument('--keywords', metavar="path/to/file", type=existing_file,
                       help="Run textrank to extract keywords from the input text.")
    # Old API
    group.add_argument('--text', '-t', metavar="path/to/file", type=existing_file,
                       help="(Deprecated) Text to summarize if --summary option is selected")

    parser.add_argument('--summary', '-s', metavar="{0,1}", type=int, choices=[SENTENCE, WORD], default=0,
                        help="(Deprecated) Type of unit to summarize: sentence (0) or word (1)")
    parser.add_argument('--ratio', '-r', metavar="r", type=restricted_float, default=DEFAULT_RATIO,
                        help="Float number (0,1] that defines the length of the summary. It's a proportion of the original text")
    parser.add_argument('--words', '-w', metavar="#words", type=int,
                        help="Number to limit the length of the summary. The length option is ignored if the word limit is set.")
    parser.add_argument('--additional_stopwords', '-a', metavar="list,of,stopwords",
                        help="Either a string of comma separated stopwords or a path to a file which has comma separated stopwords in every line")

    return parser.parse_args(args)


def main():
    args = parse_args(sys.argv[1:])

    mode = None
    text = None

    if args.summarize:
        text = args.summarize
        mode = SENTENCE
    elif args.keywords:
        text = args.keywords
        mode = WORD
    elif args.summary:  # Old api
        warnings.warn("The --summary option is deprecated. Please use either --summarize or --keywords", DeprecationWarning)
        text = args.text
        mode = args.summary

        if text is None:
            raise argparse.ArgumentTypeError('Error: no text to summarize provided.')
    else:
        raise argparse.ArgumentTypeError('Error: --summarize or --keywords is required')

    additional_stopwords = None
    if args.additional_stopwords:
        if os.path.exists(args.additional_stopwords):
            with open(args.additional_stopwords) as f:
                additional_stopwords = {s for l in f for s in l.strip().split(",")}
        else:
            additional_stopwords = args.additional_stopwords.split(",")

    print(textrank(text, mode, args.ratio, args.words, additional_stopwords))


if __name__ == "__main__":
    main()
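A sketch of driving the same pipeline from Python instead of the shell, assuming this CLI module lives at `summa/textrank.py` (as in the upstream package) and that an input file named `article.txt` exists; `existing_file()` already returns the file contents, so `args.summarize` is the text itself.

```python
# Sketch: the module path and "article.txt" are assumptions, not taken from this diff.
from summa.textrank import parse_args, textrank, SENTENCE

args = parse_args(["--summarize", "article.txt", "--ratio", "0.3"])
sentences = textrank(args.summarize, SENTENCE, args.ratio, args.words)
for s in sentences:
    print(round(s.score, 3), s.text)
```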
@ -0,0 +1,31 @@
<!DOCTYPE html>

<html lang="en">

<head>

    <meta charset="UTF-8" />

    <title>TextRank Opacity</title>
    <meta name="description" content="TextRank opacity experiment">

    <link rel="stylesheet" type="text/css" href="css/main.css" />
    <link rel="stylesheet" type="text/css" href="css/typography.css" />

    <meta name="viewport" content="width=device-width, initial-scale=1">

</head>

<body>

    <main>

        {% for s in sentences %}
        {{ s.html|safe }}
        {% endfor %}

    </main>

</body>

</html>
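The template only expects each item in `sentences` to expose a prebuilt `html` string. A hypothetical rendering sketch follows; the span/opacity markup, the input file name and the output path are illustrative assumptions, not the project's actual rendering code.

```python
# Hypothetical sketch: build an .html attribute per sentence and render template.html.
from jinja2 import Template
from summa.summarizer import summarize

with open("template.html") as f:
    template = Template(f.read())

sentences = summarize(open("some_text.txt").read())  # placeholder input file
for s in sentences:
    # assumption: scores are displayed as CSS opacity on a per-sentence span
    s.html = '<span style="opacity: {:.2f}">{}</span> '.format(s.score, s.text)

with open("index.html", "w") as f:  # placeholder output path
    f.write(template.render(sentences=sentences))
```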
@ -0,0 +1 @@
A warehouse is a building for storing goods. Warehouses are used by manufacturers, importers, exporters, wholesalers, transport businesses, customs, etc. They are usually large plain buildings in industrial parks on the outskirts of cities, towns, or villages. They usually have loading docks to load and unload goods from trucks. Sometimes warehouses are designed for the loading and unloading of goods directly from railways, airports, or seaports. They often have cranes and forklifts for moving goods, which are usually placed on ISO standard pallets and then loaded into pallet racks. Stored goods can include any raw materials, packing materials, spare parts, components, or finished goods associated with agriculture, manufacturing, and production. In India and Hong Kong, a warehouse may be referred to as a "godown". There are also godowns in the Shanghai Bund.
@ -0,0 +1,38 @@
:root{
    --lh: 1.35rem;
}

body{
    margin: var(--lh);
    line-height: var(--lh);
}

@media print{
    body{
        margin: 0;
        font-size: 10pt;
    }
}

main{
    max-width: 42rem;
    margin: 0 auto;
}

/* h1,h2,h3,h4,h5,h6{
    line-height: var(--lh);
} */

h1{
    text-align: center;
    margin: calc(2 * var(--lh)) 0;
}

h2,h3,h4,h5,h6{
    margin: calc(3 * var(--lh)) 0 var(--lh);
}

:is(h1,h2,h3,h4,h5,h6) + :is(h1,h2,h3,h4,h5,h6){
    margin-top: var(--lh);
}