You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

47 lines
1.8 KiB
Python

from .pagerank_weighted import pagerank_weighted_scipy as _pagerank
from .preprocessing.textcleaner import clean_text_by_sentences as _clean_text_by_sentences
from .commons import build_graph as _build_graph
from .commons import remove_unreachable_nodes as _remove_unreachable_nodes
from .summarizer import _set_graph_edge_weights
from .summarizer import _add_scores_to_sentences
def scored_sentences(text, language="english", split=False, additional_stopwords=None):
    """Attach PageRank scores to the cleaned sentences of ``text``.

    The text is cleaned into sentence objects, a graph is built from their
    ``token`` attributes, edge weights are set, unreachable nodes are
    removed, and the PageRank result is written back onto the sentence
    objects.

    Args:
        text: Input text; must be a ``str``.
        language: Forwarded to the sentence cleaner.
        split: Only consulted when the graph ends up with no nodes; it
            selects the empty value that is returned (``[]`` when truthy,
            ``""`` otherwise).
        additional_stopwords: Forwarded to the sentence cleaner.

    Returns:
        The cleaned sentence objects after scores have been added to them.
        If the graph has no nodes left, ``[]`` or ``""`` depending on
        ``split``.

    Raises:
        ValueError: If ``text`` is not a ``str``.
    """
    if not isinstance(text, str):
        raise ValueError("Text parameter must be a Unicode object (str)!")

    # Clean the raw text into sentence objects.
    cleaned = _clean_text_by_sentences(text, language, additional_stopwords)

    # One graph node per sentence token, then weight the edges.
    tokens = [item.token for item in cleaned]
    sentence_graph = _build_graph(tokens)
    _set_graph_edge_weights(sentence_graph)

    # Prune nodes that ended up unreachable after weighting.
    _remove_unreachable_nodes(sentence_graph)

    # PageRank cannot run on an empty graph, so bail out early.
    if not len(sentence_graph.nodes()):
        if split:
            return []
        return ""

    # Rank the remaining nodes and copy the scores onto the sentences.
    scores = _pagerank(sentence_graph)
    _add_scores_to_sentences(cleaned, scores)
    return cleaned
def similarity_graph(text, language="english", additional_stopwords=None):
    """Build the weighted sentence graph for ``text``.

    The text is cleaned into sentence objects, a graph is built from their
    ``token`` attributes, and edge weights are set on it. Unlike
    ``scored_sentences``, no nodes are pruned and no ranking is run.

    Args:
        text: Input text; must be a ``str``.
        language: Forwarded to the sentence cleaner.
        additional_stopwords: Forwarded to the sentence cleaner.

    Returns:
        The graph object after its edge weights have been set.

    Raises:
        ValueError: If ``text`` is not a ``str``.
    """
    if not isinstance(text, str):
        raise ValueError("Text parameter must be a Unicode object (str)!")

    # Clean the raw text into sentence objects.
    cleaned = _clean_text_by_sentences(text, language, additional_stopwords)

    # One graph node per sentence token, then weight the edges.
    tokens = [item.token for item in cleaned]
    sentence_graph = _build_graph(tokens)
    _set_graph_edge_weights(sentence_graph)
    return sentence_graph