from math import log10

from .pagerank_weighted import pagerank_weighted_scipy as _pagerank
from .preprocessing.textcleaner import clean_text_by_sentences as _clean_text_by_sentences
from .commons import build_graph as _build_graph
from .commons import remove_unreachable_nodes as _remove_unreachable_nodes


def _set_graph_edge_weights(graph):
    for sentence_1 in graph.nodes():
        for sentence_2 in graph.nodes():

            edge = (sentence_1, sentence_2)
            if sentence_1 != sentence_2 and not graph.has_edge(edge):
                similarity = _get_similarity(sentence_1, sentence_2)
                if similarity != 0:
                    graph.add_edge(edge, similarity)

    # Handles the case in which all similarities are zero.
    # The resultant summary will consist of random sentences.
    if all(graph.edge_weight(edge) == 0 for edge in graph.edges()):
        _create_valid_graph(graph)


def _create_valid_graph(graph):
    nodes = graph.nodes()

    for i in range(len(nodes)):
        for j in range(len(nodes)):
            if i == j:
                continue

            edge = (nodes[i], nodes[j])

            if graph.has_edge(edge):
                graph.del_edge(edge)

            graph.add_edge(edge, 1)


def _get_similarity(s1, s2):
    words_sentence_one = s1.split()
    words_sentence_two = s2.split()

    common_word_count = _count_common_words(words_sentence_one, words_sentence_two)

    log_s1 = log10(len(words_sentence_one))
    log_s2 = log10(len(words_sentence_two))

    if log_s1 + log_s2 == 0:
        return 0

    return common_word_count / (log_s1 + log_s2)


def _count_common_words(words_sentence_one, words_sentence_two):
    return len(set(words_sentence_one) & set(words_sentence_two))


def _format_results(extracted_sentences, split, score):
    if score:
        return [(sentence.text, sentence.score) for sentence in extracted_sentences]
    if split:
        return [sentence.text for sentence in extracted_sentences]
    return "\n".join([sentence.text for sentence in extracted_sentences])


def _add_scores_to_sentences(sentences, scores):
    for sentence in sentences:
        # Adds the score to the object if it has one.
        if sentence.token in scores:
            sentence.score = scores[sentence.token]
        else:
            sentence.score = 0


def _get_sentences_with_word_count(sentences, words):
    """Given a list of sentences, returns a list of sentences with a
    total word count similar to the word count provided.
    """
    word_count = 0
    selected_sentences = []

    # Loops until the word count is reached.
    for sentence in sentences:
        words_in_sentence = len(sentence.text.split())

        # Checks if the inclusion of the sentence gives a better approximation
        # to the word parameter.
        if abs(words - word_count - words_in_sentence) > abs(words - word_count):
            return selected_sentences

        selected_sentences.append(sentence)
        word_count += words_in_sentence

    return selected_sentences


def _extract_most_important_sentences(sentences, ratio, words):
    sentences.sort(key=lambda s: s.score, reverse=True)

    # If no "words" option is selected, the number of sentences is
    # reduced by the provided ratio.
    if words is None:
        length = len(sentences) * ratio
        return sentences[:int(length)]

    # Else, the ratio is ignored.
    else:
        return _get_sentences_with_word_count(sentences, words)
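
# Worked example of the similarity measure in _get_similarity above
# (a sketch; the sentences are hypothetical, not taken from the library's tests):
#
#   s1 = "the cat sat on the mat"   -> 6 tokens, log10(6) ~= 0.778
#   s2 = "the cat lay on the rug"   -> 6 tokens, log10(6) ~= 0.778
#   common words: {"the", "cat", "on"} -> 3
#
#   similarity(s1, s2) = 3 / (0.778 + 0.778) ~= 1.93
#
# Dividing by the log of each sentence's length keeps long sentences from
# dominating merely because they share more words by chance.
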
def summarize(text, ratio=0.2, words=None, language="english", split=False, scores=False, additional_stopwords=None):
    if not isinstance(text, str):
        raise ValueError("Text parameter must be a Unicode object (str)!")

    # Gets a list of processed sentences.
    sentences = _clean_text_by_sentences(text, language, additional_stopwords)

    # Creates the graph and calculates the similarity coefficient for every pair of nodes.
    graph = _build_graph([sentence.token for sentence in sentences])
    _set_graph_edge_weights(graph)

    # Removes all nodes with all edge weights equal to zero.
    _remove_unreachable_nodes(graph)

    # PageRank cannot be run on an empty graph.
    if len(graph.nodes()) == 0:
        return [] if split else ""

    # Ranks the tokens using the PageRank algorithm. Returns a dict of sentence -> score.
    pagerank_scores = _pagerank(graph)

    # Adds the summa scores to the sentence objects.
    _add_scores_to_sentences(sentences, pagerank_scores)

    # Extracts the most important sentences with the selected criterion.
    extracted_sentences = _extract_most_important_sentences(sentences, ratio, words)

    # Sorts the extracted sentences by their order of appearance in the original text.
    extracted_sentences.sort(key=lambda s: s.index)

    return _format_results(extracted_sentences, split, scores)


def get_graph(text, language="english"):
    sentences = _clean_text_by_sentences(text, language)

    graph = _build_graph([sentence.token for sentence in sentences])
    _set_graph_edge_weights(graph)

    return graph
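
# Minimal usage sketch (assumes this module is importable as part of the
# summa package, e.g. as summa.summarizer; the input text is hypothetical):
#
#     from summa.summarizer import summarize
#
#     text = "..."  # a few paragraphs of plain text
#
#     # Top ~20% of sentences, in original order, joined by newlines.
#     print(summarize(text))
#
#     # The same selection as a list of (sentence, score) pairs.
#     print(summarize(text, scores=True))
#
#     # A summary trimmed to roughly 50 words; `ratio` is ignored.
#     print(summarize(text, words=50))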