# NOTE: the lines below are residue from the repository web viewer, not module code.
# You cannot select more than 25 topics.
# Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
# File banner: 47 lines, 1.8 KiB, Python.
from .pagerank_weighted import pagerank_weighted_scipy as _pagerank
|
|
from .preprocessing.textcleaner import clean_text_by_sentences as _clean_text_by_sentences
|
|
from .commons import build_graph as _build_graph
|
|
from .commons import remove_unreachable_nodes as _remove_unreachable_nodes
|
|
from .summarizer import _set_graph_edge_weights
|
|
from .summarizer import _add_scores_to_sentences
|
|
|
|
|
|
def scored_sentences(text, language="english", split=False, additional_stopwords=None):
    """Attach a PageRank score to every sentence of ``text``.

    The text is cleaned into sentence objects, a graph is built over the
    sentence tokens, edge weights are set, nodes whose edge weights are all
    zero are removed, and weighted PageRank is run on what remains.

    Args:
        text: The input text; must be a ``str``.
        language: Language name forwarded to the text cleaner.
        split: Kept for backward compatibility only. It used to select
            between ``[]`` and ``""`` for the empty result; the empty result
            is now always ``[]`` so the return type matches the normal path.
        additional_stopwords: Extra stopwords forwarded to the text cleaner.

    Returns:
        The list of sentence objects produced by the cleaner, after the
        PageRank scores have been added to them. An empty list is returned
        when the graph has no nodes left to rank.

    Raises:
        ValueError: If ``text`` is not a ``str``.
    """
    if not isinstance(text, str):
        raise ValueError("Text parameter must be a Unicode object (str)!")

    # Gets a list of processed sentences.
    sentences = _clean_text_by_sentences(text, language, additional_stopwords)

    # Creates the graph and calculates the similarity coefficient for every
    # pair of nodes.
    graph = _build_graph([sentence.token for sentence in sentences])
    _set_graph_edge_weights(graph)

    # Remove all nodes with all edges weights equal to zero.
    _remove_unreachable_nodes(graph)

    # PageRank cannot be run in an empty graph. Return an empty list rather
    # than "" so callers always receive the same type as the normal path.
    if len(graph.nodes()) == 0:
        return []

    # Ranks the tokens using the PageRank algorithm; the result maps
    # sentence -> score.
    pagerank_scores = _pagerank(graph)

    # Adds the scores to the sentence objects (done by the helper).
    _add_scores_to_sentences(sentences, pagerank_scores)

    return sentences
|
|
|
|
|
|
def similarity_graph(text, language="english", additional_stopwords=None):
    """Build the weighted sentence graph for ``text``.

    The text is cleaned into sentence objects, a graph is built over the
    sentence tokens and its edge weights are set. No nodes are removed and
    no ranking is performed here.

    Args:
        text: The input text; must be a ``str``.
        language: Language name forwarded to the text cleaner.
        additional_stopwords: Extra stopwords forwarded to the text cleaner.

    Returns:
        The graph object created by the graph builder, after its edge
        weights have been set.

    Raises:
        ValueError: If ``text`` is not a ``str``.
    """
    if not isinstance(text, str):
        raise ValueError("Text parameter must be a Unicode object (str)!")

    # Gets a list of processed sentences.
    sentences = _clean_text_by_sentences(text, language, additional_stopwords)

    # Creates the graph and calculates the similarity coefficient for every
    # pair of nodes.
    graph = _build_graph([sentence.token for sentence in sentences])
    _set_graph_edge_weights(graph)

    return graph