from .pagerank_weighted import pagerank_weighted_scipy as _pagerank
from .preprocessing.textcleaner import clean_text_by_sentences as _clean_text_by_sentences
from .commons import build_graph as _build_graph
from .commons import remove_unreachable_nodes as _remove_unreachable_nodes
from .summarizer import _set_graph_edge_weights
from .summarizer import _add_scores_to_sentences
from .summarizer import _create_valid_graph


def count_words(line):
    # Splitting on any whitespace run avoids miscounting on repeated spaces.
    return len(line.split())


def remove_shorts(sentences, words_number):
    # Keeps only the sentences that have strictly more than words_number words.
    return [s for s in sentences if count_words(s.text) > words_number]


# SCORED SENTENCES (output the list of sentences with their scores)
# -------------------------------------------------------------------------------------

def scored_sentences(text, words_number=0, language="english", split=False, additional_stopwords=None):
    if not isinstance(text, str):
        raise ValueError("Text parameter must be a Unicode object (str)!")

    # Gets a list of processed sentences.
    sentences = _clean_text_by_sentences(text, language, additional_stopwords)

    # Removes the sentences with words_number words or fewer.
    if words_number:
        sentences = remove_shorts(sentences, words_number)

    # Creates the graph and calculates the similarity coefficient for every pair of nodes.
    graph = _build_graph([sentence.token for sentence in sentences])
    _set_graph_edge_weights(graph)

    # Removes all nodes whose edges all have weight zero.
    _remove_unreachable_nodes(graph)

    # PageRank cannot be run on an empty graph; returns an empty sentence list.
    if len(graph.nodes()) == 0:
        return []

    # Ranks the tokens using the PageRank algorithm. Returns a dict of node -> score.
    pagerank_scores = _pagerank(graph)

    # Adds the summa scores to the sentence objects.
    _add_scores_to_sentences(sentences, pagerank_scores)

    return sentences
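
# A minimal usage sketch for scored_sentences (assumes this module lives inside
# the summa package so the relative imports resolve; `text` stands for any
# multi-sentence string, not a value defined in this module):
#
#   sentences = scored_sentences(text, words_number=3)
#   for s in sorted(sentences, key=lambda s: s.score, reverse=True):
#       print(round(s.score, 3), s.text)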

# SIMILARITY GRAPH (output the whole matrix/graph of similarity with weighted edges)
# -------------------------------------------------------------------------------------

def similarity_graph(text, words_number=0, language="english", additional_stopwords=None):
    if not isinstance(text, str):
        raise ValueError("Text parameter must be a Unicode object (str)!")

    # Gets a list of processed sentences.
    sentences = _clean_text_by_sentences(text, language, additional_stopwords)

    # Removes the sentences with words_number words or fewer.
    if words_number:
        sentences = remove_shorts(sentences, words_number)

    # Creates the graph and calculates the similarity coefficient for every pair of nodes.
    graph = _build_graph([sentence.token for sentence in sentences])
    _set_graph_edge_weights(graph)

    return graph
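
# A minimal usage sketch for similarity_graph (the edges()/edge_weight() calls
# follow the same graph interface already used elsewhere in this module):
#
#   graph = similarity_graph(text)
#   for edge in graph.edges():
#       print(edge, graph.edge_weight(edge))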

# CUSTOM SUMMARIZATION (self-made similarity matrix)
# -------------------------------------------------------------------------------------

def set_graph_custom_edge_weights(sentences, graph, matrix):
    # Allows us to input a custom 'hand-made' similarity matrix and feed it to
    # PageRank. The graph nodes are sentence tokens while the matrix is keyed
    # by sentence text, so each token is first mapped back to its sentence text.
    token_to_text = {sentence.token: sentence.text for sentence in sentences}

    for sentence_1 in graph.nodes():
        for sentence_2 in graph.nodes():
            s1_text = token_to_text[sentence_1]
            s2_text = token_to_text[sentence_2]

            edge = (sentence_1, sentence_2)
            if sentence_1 != sentence_2 and not graph.has_edge(edge):
                similarity = matrix[s1_text][s2_text]
                if similarity != 0:
                    graph.add_edge(edge, similarity)

    # Handles the case in which all similarities are zero.
    # The resultant summary will consist of random sentences.
    if all(graph.edge_weight(edge) == 0 for edge in graph.edges()):
        _create_valid_graph(graph)
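
# The matrix is expected to be a dict-of-dicts keyed by sentence text, symmetric
# and covering every pair of sentences. A sketch with hypothetical sentences:
#
#   matrix = {
#       "First sentence.": {"First sentence.": 0.0, "Second one.": 0.4},
#       "Second one.":     {"First sentence.": 0.4, "Second one.": 0.0},
#   }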

def custom_summarize(text, matrix, ratio=0.2, words=None, language="english", split=False, scores=False, additional_stopwords=None):
    if not isinstance(text, str):
        raise ValueError("Text parameter must be a Unicode object (str)!")

    # Gets a list of processed sentences.
    sentences = _clean_text_by_sentences(text, language, additional_stopwords)

    # Creates the graph and calculates the similarity coefficient for every pair of nodes.
    graph = _build_graph([sentence.token for sentence in sentences])
    set_graph_custom_edge_weights(sentences, graph, matrix)

    # Removes all nodes whose edges all have weight zero.
    _remove_unreachable_nodes(graph)

    # PageRank cannot be run on an empty graph; returns an empty sentence list.
    if len(graph.nodes()) == 0:
        return []

    # Ranks the tokens using the PageRank algorithm. Returns a dict of node -> score.
    pagerank_scores = _pagerank(graph)

    # Adds the summa scores to the sentence objects.
    _add_scores_to_sentences(sentences, pagerank_scores)

    # EDIT: return the whole sentences with scores.
    return sentences

    # Extracts the most important sentences with the selected criterion.
    # extracted_sentences = _extract_most_important_sentences(sentences, ratio, words)

    # Sorts the extracted sentences by order of appearance in the original text.
    # extracted_sentences.sort(key=lambda s: s.index)

    # return _format_results(extracted_sentences, split, scores)
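
# A minimal usage sketch for custom_summarize (`my_matrix` is a hypothetical
# dict-of-dicts like the one sketched above, keyed by the sentence texts that
# the cleaner extracts from `text`):
#
#   sentences = custom_summarize(text, my_matrix)
#   best = max(sentences, key=lambda s: s.score)
#   print(best.score, best.text)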
|