diff --git a/README.md b/README.md
index 50fc032..6bb6b57 100644
--- a/README.md
+++ b/README.md
@@ -30,4 +30,12 @@ modify the variable `wikipedia_page` in `make.py` to whatever page then
 * using the `.content` method of python wikipedia, we get **plain text plus header in wikitext**, but things like inline HTML tags all disappeared. see if we want to craft a version using the `.html` method of python wikipedia, but it becomes more complex because of sentence tokenisation, probably need an index to keep track of each sentence's original nested div location.
 * **opacities were remapped** to add contrast to their curves. still need to experiment with that to find some kind of nice compromise on both paper and screen?
-## [EXP] recommanded
\ No newline at end of file
+## [EXP] recommended
+
+## [EXP] custom similarity
+
+### technical note
+
+* had to build a `similarity_graph` function to get the similarity matrix of a text.
+* those numbers are computed in the `_get_similarity` function in `summarizer.py`: it counts the words two sentences share and divides by the log of their lengths. the values range from 0 up to roughly 3.5 and are not normalized in any way, so in practice we can feed pagerank whatever weights we like (see the sketch below).
+* we want to input our own matrices, so we add `set_graph_custom_edge_weights` and `custom_summarize` (usage sketch below).
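+
+a minimal sketch of that default weight, paraphrasing `_get_similarity` under the assumption of plain whitespace tokenisation (not a verbatim copy of the summa code):
+
+```python
+from math import log10
+
+def textrank_similarity(s1, s2):
+    # words shared by the two sentences, divided by the log of their lengths
+    w1, w2 = s1.split(), s2.split()
+    common = len(set(w1) & set(w2))
+    denominator = log10(len(w1)) + log10(len(w2))
+    return common / denominator if denominator > 0 else 0
+```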
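+
+and a hedged usage sketch for `custom_summarize`: the nested-dict shape is an assumption read off the `matrix[s1_text][s2_text]` lookup in `edits.py`, and every pair of sentences must be present (use a weight of 0 to skip an edge):
+
+```python
+from summa.edits import custom_summarize
+
+text = "The cat sat on the mat. The dog slept. The cat chased the dog."
+
+# keys are the raw sentence texts, values are our hand-made weights
+matrix = {
+    "The cat sat on the mat.": {"The dog slept.": 0.1, "The cat chased the dog.": 0.9},
+    "The dog slept.": {"The cat sat on the mat.": 0.1, "The cat chased the dog.": 0.5},
+    "The cat chased the dog.": {"The cat sat on the mat.": 0.9, "The dog slept.": 0.5},
+}
+
+# returns the full list of sentence objects with their pagerank scores
+for sentence in custom_summarize(text, matrix):
+    print(sentence.score, sentence.text)
+```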
\ No newline at end of file
diff --git a/summa/__pycache__/edits.cpython-38.pyc b/summa/__pycache__/edits.cpython-38.pyc
index e9ccb7b..bf3586b 100644
Binary files a/summa/__pycache__/edits.cpython-38.pyc and b/summa/__pycache__/edits.cpython-38.pyc differ
diff --git a/summa/__pycache__/summarizer.cpython-38.pyc b/summa/__pycache__/summarizer.cpython-38.pyc
index cd47653..821c63b 100644
Binary files a/summa/__pycache__/summarizer.cpython-38.pyc and b/summa/__pycache__/summarizer.cpython-38.pyc differ
diff --git a/summa/edits.py b/summa/edits.py
index b77fb96..b0fd654 100644
--- a/summa/edits.py
+++ b/summa/edits.py
@@ -1,18 +1,33 @@
 from .pagerank_weighted import pagerank_weighted_scipy as _pagerank
 from .preprocessing.textcleaner import clean_text_by_sentences as _clean_text_by_sentences
 from .commons import build_graph as _build_graph
 from .commons import remove_unreachable_nodes as _remove_unreachable_nodes
 from .summarizer import _set_graph_edge_weights
 from .summarizer import _add_scores_to_sentences
+from .summarizer import _create_valid_graph
 
+
+def count_words(line):
+    return len(line.split())
+
+
+def remove_shorts(sentences, words_number):
+    return [s for s in sentences if count_words(s.text) > words_number]
+
+
+# SCORED SENTENCES (output the list of sentences with their scores)
+# -------------------------------------------------------------------------------------
+
-def scored_sentences(text, language="english", split=False, additional_stopwords=None):
+def scored_sentences(text, words_number=0, language="english", split=False, additional_stopwords=None):
     if not isinstance(text, str):
         raise ValueError("Text parameter must be a Unicode object (str)!")
 
     # Gets a list of processed sentences.
     sentences = _clean_text_by_sentences(text, language, additional_stopwords)
 
+    # Remove sentences with words_number words or fewer.
+    if words_number:
+        sentences = remove_shorts(sentences, words_number)
+
     # Creates the graph and calculates the similarity coefficient for every pair of nodes.
     graph = _build_graph([sentence.token for sentence in sentences])
     _set_graph_edge_weights(graph)
@@ -33,15 +48,83 @@ def scored_sentences(text, language="english", split=False, additional_stopwords
     return sentences
 
-def similarity_graph(text, language="english", additional_stopwords=None):
+# SIMILARITY GRAPH (output the whole matrix/graph of similarity with weighted edges)
+# -------------------------------------------------------------------------------------
+
+def similarity_graph(text, words_number=0, language="english", additional_stopwords=None):
     if not isinstance(text, str):
         raise ValueError("Text parameter must be a Unicode object (str)!")
 
     # Gets a list of processed sentences.
     sentences = _clean_text_by_sentences(text, language, additional_stopwords)
 
+    # Remove sentences with words_number words or fewer.
+    if words_number:
+        sentences = remove_shorts(sentences, words_number)
+
     # Creates the graph and calculates the similarity coefficient for every pair of nodes.
     graph = _build_graph([sentence.token for sentence in sentences])
     _set_graph_edge_weights(graph)
 
-    return graph
\ No newline at end of file
+    return graph
+
+
+# CUSTOM SUMMARIZATION (self-made similarity matrix)
+# -------------------------------------------------------------------------------------
+
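+# The `matrix` argument below is assumed to be a nested dict keyed by raw
+# sentence text on both axes, i.e. matrix[s1_text][s2_text] -> float weight.
+# Every pair of distinct sentences must be present (a weight of 0 skips the
+# edge), since set_graph_custom_edge_weights reads matrix[s1_text][s2_text]
+# directly and a missing pair would raise a KeyError.
+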
+def set_graph_custom_edge_weights(sentences, graph, matrix):
+    # allows us to input a custom 'hand-made' matrix and feed it to pagerank
+
+    # graph nodes are sentence tokens, so map each token back to its raw text
+    token_to_text = {sentence.token: sentence.text for sentence in sentences}
+
+    for sentence_1 in graph.nodes():
+        for sentence_2 in graph.nodes():
+
+            s1_text = token_to_text[sentence_1]
+            s2_text = token_to_text[sentence_2]
+
+            edge = (sentence_1, sentence_2)
+            if sentence_1 != sentence_2 and not graph.has_edge(edge):
+                similarity = matrix[s1_text][s2_text]
+                if similarity != 0:
+                    graph.add_edge(edge, similarity)
+
+    # Handles the case in which all similarities are zero.
+    # The resultant summary will consist of random sentences.
+    if all(graph.edge_weight(edge) == 0 for edge in graph.edges()):
+        _create_valid_graph(graph)
+
+
+def custom_summarize(text, matrix, ratio=0.2, words=None, language="english", split=False, scores=False, additional_stopwords=None):
+    if not isinstance(text, str):
+        raise ValueError("Text parameter must be a Unicode object (str)!")
+
+    # Gets a list of processed sentences.
+    sentences = _clean_text_by_sentences(text, language, additional_stopwords)
+
+    # Creates the graph and sets the custom similarity weights on its edges.
+    graph = _build_graph([sentence.token for sentence in sentences])
+    set_graph_custom_edge_weights(sentences, graph, matrix)
+
+    # Remove all nodes with all edge weights equal to zero.
+    _remove_unreachable_nodes(graph)
+
+    # PageRank cannot be run in an empty graph.
+    if len(graph.nodes()) == 0:
+        return [] if split else ""
+
+    # Ranks the tokens using the PageRank algorithm. Returns dict of sentence -> score
+    pagerank_scores = _pagerank(graph)
+
+    # Adds the summa scores to the sentence objects.
+    _add_scores_to_sentences(sentences, pagerank_scores)
+
+    # EDIT: return the whole sentences with scores
+    return sentences
+
+    # Extracts the most important sentences with the selected criterion.
+    # extracted_sentences = _extract_most_important_sentences(sentences, ratio, words)
+
+    # Sorts the extracted sentences by apparition order in the original text.
+    # extracted_sentences.sort(key=lambda s: s.index)
+
+    # return _format_results(extracted_sentences, split, scores)
\ No newline at end of file
diff --git a/summa/summarizer.py b/summa/summarizer.py
index 952625e..1ab296d 100644
--- a/summa/summarizer.py
+++ b/summa/summarizer.py
@@ -133,16 +133,13 @@ def summarize(text, ratio=0.2, words=None, language="english", split=False, scor
     # Adds the summa scores to the sentence objects.
     _add_scores_to_sentences(sentences, pagerank_scores)
 
-    # EDIT: return the whole sentences with scores
-    return sentences
-
     # Extracts the most important sentences with the selected criterion.
-    # extracted_sentences = _extract_most_important_sentences(sentences, ratio, words)
+    extracted_sentences = _extract_most_important_sentences(sentences, ratio, words)
 
     # Sorts the extracted sentences by apparition order in the original text.
-    # extracted_sentences.sort(key=lambda s: s.index)
+    extracted_sentences.sort(key=lambda s: s.index)
 
-    # return _format_results(extracted_sentences, split, scores)
+    return _format_results(extracted_sentences, split, scores)
 
 
 def get_graph(text, language="english"):