from .pagerank_weighted import pagerank_weighted_scipy as _pagerank
from .preprocessing.textcleaner import clean_text_by_sentences as _clean_text_by_sentences
from .commons import build_graph as _build_graph
from .commons import remove_unreachable_nodes as _remove_unreachable_nodes
from .summarizer import _set_graph_edge_weights
from .summarizer import _add_scores_to_sentences
from .summarizer import _create_valid_graph


def count_words(line):
    # split() without an argument handles runs of whitespace and newlines.
    return len(line.split())


def remove_shorts(sentences, words_number):
    # Keeps only the sentences with more than words_number words.
    return [s for s in sentences if count_words(s.text) > words_number]


# SCORED SENTENCES (output the list of sentences with their scores)
# -------------------------------------------------------------------------------------

def scored_sentences(text, words_number=0, language="english", split=False, additional_stopwords=None):
    if not isinstance(text, str):
        raise ValueError("Text parameter must be a Unicode object (str)!")

    # Gets a list of processed sentences.
    sentences = _clean_text_by_sentences(text, language, additional_stopwords)

    # Removes sentences with words_number words or fewer.
    if words_number:
        sentences = remove_shorts(sentences, words_number)

    # Creates the graph and calculates the similarity coefficient for every pair of nodes.
    graph = _build_graph([sentence.token for sentence in sentences])
    _set_graph_edge_weights(graph)

    # Removes all nodes with all edge weights equal to zero.
    _remove_unreachable_nodes(graph)

    # PageRank cannot be run on an empty graph.
    if len(graph.nodes()) == 0:
        return [] if split else ""

    # Ranks the tokens using the PageRank algorithm. Returns a dict of sentence -> score.
    pagerank_scores = _pagerank(graph)

    # Adds the summa scores to the sentence objects.
    _add_scores_to_sentences(sentences, pagerank_scores)

    return sentences


# SIMILARITY GRAPH (output the whole matrix/graph of similarity with weighted edges)
# -------------------------------------------------------------------------------------

def similarity_graph(text, words_number=0, language="english", additional_stopwords=None):
    if not isinstance(text, str):
        raise ValueError("Text parameter must be a Unicode object (str)!")

    # Gets a list of processed sentences.
    sentences = _clean_text_by_sentences(text, language, additional_stopwords)

    # Removes sentences with words_number words or fewer.
    if words_number:
        sentences = remove_shorts(sentences, words_number)

    # Creates the graph and calculates the similarity coefficient for every pair of nodes.
    graph = _build_graph([sentence.token for sentence in sentences])
    _set_graph_edge_weights(graph)

    return graph


# CUSTOM SUMMARIZATION (self-made similarity matrix)
# -------------------------------------------------------------------------------------

def set_graph_custom_edge_weights(sentences, graph, matrix):
    # Allows a custom, hand-made similarity matrix to be fed to PageRank.
    # Graph nodes are sentence tokens, so map each token back to its raw text
    # in order to look the pair up in the matrix.
    text_by_token = {sentence.token: sentence.text for sentence in sentences}
    for sentence_1 in graph.nodes():
        for sentence_2 in graph.nodes():
            s1_text = text_by_token[sentence_1]
            s2_text = text_by_token[sentence_2]
            edge = (sentence_1, sentence_2)
            if sentence_1 != sentence_2 and not graph.has_edge(edge):
                similarity = matrix[s1_text][s2_text]
                if similarity != 0:
                    graph.add_edge(edge, similarity)

    # Handles the case in which all similarities are zero.
    # The resulting summary will consist of random sentences.
    if all(graph.edge_weight(edge) == 0 for edge in graph.edges()):
        _create_valid_graph(graph)
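

# Illustrative helper (not part of the original module): builds a hand-made
# similarity matrix in the shape set_graph_custom_edge_weights expects, i.e. a
# nested mapping indexed as matrix[s1_text][s2_text] with the raw sentence
# texts as keys. The Jaccard word overlap used here is only a placeholder
# measure; any symmetric, non-negative score would do.
def _example_overlap_matrix(texts):
    matrix = {}
    for t1 in texts:
        matrix[t1] = {}
        for t2 in texts:
            words_1, words_2 = set(t1.split()), set(t2.split())
            union = words_1 | words_2
            # Jaccard similarity: shared words over total distinct words.
            matrix[t1][t2] = len(words_1 & words_2) / len(union) if union else 0.0
    return matrix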


def custom_summarize(text, matrix, ratio=0.2, words=None, language="english", split=False, scores=False, additional_stopwords=None):
    if not isinstance(text, str):
        raise ValueError("Text parameter must be a Unicode object (str)!")

    # Gets a list of processed sentences.
    sentences = _clean_text_by_sentences(text, language, additional_stopwords)

    # Creates the graph and assigns the custom similarity coefficient to every pair of nodes.
    graph = _build_graph([sentence.token for sentence in sentences])
    set_graph_custom_edge_weights(sentences, graph, matrix)

    # Removes all nodes with all edge weights equal to zero.
    _remove_unreachable_nodes(graph)

    # PageRank cannot be run on an empty graph.
    if len(graph.nodes()) == 0:
        return [] if split else ""

    # Ranks the tokens using the PageRank algorithm. Returns a dict of sentence -> score.
    pagerank_scores = _pagerank(graph)

    # Adds the summa scores to the sentence objects.
    _add_scores_to_sentences(sentences, pagerank_scores)

    # EDIT: return the whole sentences with scores.
    return sentences

    # Extracts the most important sentences with the selected criterion.
    # extracted_sentences = _extract_most_important_sentences(sentences, ratio, words)
    # Sorts the extracted sentences by appearance order in the original text.
    # extracted_sentences.sort(key=lambda s: s.index)
    # return _format_results(extracted_sentences, split, scores)
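

# Usage sketch (hypothetical; assumes this module lives inside the summa
# package so the relative imports above resolve, and that it is importable
# as summa.custom):
#
#     from summa.custom import scored_sentences, custom_summarize, _example_overlap_matrix
#
#     text = "Some document with several sentences. ..."
#
#     # Default TextRank similarity weights:
#     for sentence in scored_sentences(text):
#         print(sentence.score, sentence.text)
#
#     # Hand-made similarity matrix keyed by raw sentence text:
#     matrix = _example_overlap_matrix([s.text for s in scored_sentences(text)])
#     for sentence in custom_summarize(text, matrix):
#         print(sentence.score, sentence.text)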