# NOTE(review): stray unified-diff hunk header ("@ -1,18 +1,33 @@") removed —
# this file appears to be a raw patch paste, not merged source.
from os import remove
from . pagerank_weighted import pagerank_weighted_scipy as _pagerank
from . preprocessing . textcleaner import clean_text_by_sentences as _clean_text_by_sentences
from . commons import build_graph as _build_graph
from . commons import remove_unreachable_nodes as _remove_unreachable_nodes
from . summarizer import _set_graph_edge_weights
from . summarizer import _add_scores_to_sentences
from . summarizer import _create_valid_graph
def count_words(line):
    """Return the number of space-delimited tokens in *line*.

    Mirrors ``len(line.split(" "))``: a bare split on a single space, so an
    empty string counts as one token and consecutive spaces produce empties.
    """
    # len(s.split(" ")) is always exactly one more than the number of spaces.
    return line.count(" ") + 1
def remove_shorts(sentences, words_number):
    """Return only the sentences whose text has more than *words_number* words.

    Word counts come from count_words(), i.e. a plain split on single spaces.
    Must live at module level: scored_sentences() and similarity_graph() both
    call it. (A stale, superseded ``def scored_sentences(...)`` header from an
    unresolved patch was removed here.)
    """
    return [s for s in sentences if count_words(s.text) > words_number]
# SCORED SENTENCES (output the list of sentences with their scores)
# -------------------------------------------------------------------------------------
def scored_sentences(text, words_number=0, language="english", split=False, additional_stopwords=None):
    """Score every sentence of *text* with TextRank and return the sentence objects.

    Parameters
    ----------
    text : str
        Document to analyse; anything other than str raises ValueError.
    words_number : int, optional
        When non-zero, sentences with at most this many words are discarded
        before the graph is built.
    language : str, optional
        Language passed to the text cleaner (stemming / stopwords).
    split : bool, optional
        Only affects the empty-graph return value ([] instead of "").
    additional_stopwords : iterable of str, optional
        Extra stopwords merged into the language defaults.

    Returns
    -------
    list
        Processed sentence objects with PageRank scores attached, or
        [] / "" when the graph ends up empty.
    """
    if not isinstance(text, str):
        raise ValueError("Text parameter must be a Unicode object (str)!")

    # Gets a list of processed sentences.
    sentences = _clean_text_by_sentences(text, language, additional_stopwords)

    # Remove sentences with less than words_number words.
    if words_number:
        sentences = remove_shorts(sentences, words_number)

    # Creates the graph and calculates the similarity coefficient for every pair of nodes.
    graph = _build_graph([sentence.token for sentence in sentences])
    _set_graph_edge_weights(graph)

    # NOTE(review): the middle of this function was elided by a diff hunk marker
    # in the pasted source; the steps below were reconstructed from the parallel
    # pipeline in custom_summarize() — confirm against the original file.
    # Remove all nodes with all edge weights equal to zero.
    _remove_unreachable_nodes(graph)

    # PageRank cannot be run on an empty graph.
    if len(graph.nodes()) == 0:
        return [] if split else ""

    # Ranks the tokens using the PageRank algorithm (dict of sentence -> score).
    pagerank_scores = _pagerank(graph)

    # Adds the summa scores to the sentence objects.
    _add_scores_to_sentences(sentences, pagerank_scores)

    # Returns the whole scored sentence list (no extraction step).
    return sentences
# SIMILARITY GRAPH (output the whole matrix/graph of similarity with weighted edges)
# -------------------------------------------------------------------------------------
def similarity_graph(text, words_number=0, language="english", additional_stopwords=None):
    """Build and return the weighted sentence-similarity graph for *text*.

    A stale, superseded ``def similarity_graph(...)`` header (old signature,
    left over from an unresolved patch) was removed from above this function.

    Parameters
    ----------
    text : str
        Document to analyse; anything other than str raises ValueError.
    words_number : int, optional
        When non-zero, sentences with at most this many words are discarded
        before the graph is built.
    language : str, optional
        Language passed to the text cleaner (stemming / stopwords).
    additional_stopwords : iterable of str, optional
        Extra stopwords merged into the language defaults.

    Returns
    -------
    graph
        The project graph object with one node per sentence token and
        similarity-weighted edges.
    """
    if not isinstance(text, str):
        raise ValueError("Text parameter must be a Unicode object (str)!")

    # Gets a list of processed sentences.
    sentences = _clean_text_by_sentences(text, language, additional_stopwords)

    # Remove sentences with less than words_number words.
    if words_number:
        sentences = remove_shorts(sentences, words_number)

    # Creates the graph and calculates the similarity coefficient for every pair of nodes.
    graph = _build_graph([sentence.token for sentence in sentences])
    _set_graph_edge_weights(graph)

    return graph
# CUSTOM SUMMARIZATION (self made similarity matrix)
# -------------------------------------------------------------------------------------
def set_graph_custom_edge_weights(sentences, graph, matrix):
    """Feed a user-supplied similarity matrix into *graph* for pagerank.

    Weights are looked up as matrix[text_a][text_b], where the texts come from
    sentences indexed by the graph nodes; zero-weight pairs get no edge.
    """
    nodes = graph.nodes()
    for node_a in nodes:
        for node_b in nodes:
            text_a = sentences[node_a].text
            text_b = sentences[node_b].text
            pair = (node_a, node_b)
            if node_a != node_b and not graph.has_edge(pair):
                weight = matrix[text_a][text_b]
                if weight != 0:
                    graph.add_edge(pair, weight)

    # Handles the case in which all similarities are zero:
    # fall back to a uniformly valid graph so PageRank still runs
    # (the resulting summary degenerates to random sentences).
    if all(graph.edge_weight(e) == 0 for e in graph.edges()):
        _create_valid_graph(graph)
def custom_summarize(text, matrix, ratio=0.2, words=None, language="english", split=False, scores=False, additional_stopwords=None):
    """Score the sentences of *text* using a hand-made similarity *matrix*.

    Unlike the stock summarizer, pairwise similarities are not computed from
    the text: they are read from *matrix*, a mapping of the form
    ``matrix[sentence_text][other_sentence_text] -> weight``.

    Parameters
    ----------
    text : str
        Document to analyse; anything other than str raises ValueError.
    matrix : mapping
        Custom pairwise similarity weights keyed by sentence text.
    ratio, words, scores : unused
        Kept only for signature compatibility with the stock summarize();
        this function deliberately returns every scored sentence instead of
        extracting a summary subset.
    language : str, optional
        Language passed to the text cleaner (stemming / stopwords).
    split : bool, optional
        Only affects the empty-graph return value ([] instead of "").
    additional_stopwords : iterable of str, optional
        Extra stopwords merged into the language defaults.

    Returns
    -------
    list
        Processed sentence objects with PageRank scores attached, or
        [] / "" when the graph ends up empty.
    """
    if not isinstance(text, str):
        raise ValueError("Text parameter must be a Unicode object (str)!")

    # Gets a list of processed sentences.
    sentences = _clean_text_by_sentences(text, language, additional_stopwords)

    # Creates the graph and feeds it the user-supplied similarity weights.
    graph = _build_graph([sentence.token for sentence in sentences])
    set_graph_custom_edge_weights(sentences, graph, matrix)

    # Remove all nodes with all edge weights equal to zero.
    _remove_unreachable_nodes(graph)

    # PageRank cannot be run on an empty graph.
    if len(graph.nodes()) == 0:
        return [] if split else ""

    # Ranks the tokens using the PageRank algorithm (dict of sentence -> score).
    pagerank_scores = _pagerank(graph)

    # Adds the summa scores to the sentence objects.
    _add_scores_to_sentences(sentences, pagerank_scores)

    # EDIT: the extraction/formatting steps of the upstream summarizer
    # (_extract_most_important_sentences / _format_results) were intentionally
    # dropped — callers get the full scored sentence list.
    return sentences