modify the variable `wikipedia_page` in `make.py` to whatever page, then
* using `.content` from the python `wikipedia` package, we get **plain text plus headers in wikitext**, but tags like `<p>`, `<ul>`, `<blockquote>`, etc. all disappear. see if we want to craft a version using the `.html()` method instead, but it gets more complex because of sentence tokenisation: we would probably need an index to keep track of each sentence's original nested div location (a sketch follows this list).
* **opacities were remapped** to add contrast to their curves. still need to experiment with that to find a nice compromise that works on both paper and screen (a purely illustrative remap is also sketched below).
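
a minimal sketch of the two access paths, assuming the `wikipedia` package from PyPI is the one driven by `make.py`; the page title below is only a placeholder, the real one lives in the `wikipedia_page` variable:

```python
# minimal sketch, assuming the `wikipedia` package from PyPI;
# the page title is a placeholder, the real one is `wikipedia_page` in make.py
import wikipedia

page = wikipedia.page("Stigmergy")  # placeholder title

# plain text: section headers survive in wikitext form ("== History =="),
# but the <p>, <ul>, <blockquote>, ... block structure is gone
plain = page.content

# rendered HTML: block structure survives, but sentence tokenisation would have
# to happen inside each tag, so we would need an index mapping every sentence
# back to its original nested location
html = page.html()

print(plain[:300])
print(html[:300])
```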
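
not the actual remap used here (that one is still being tuned), but as a purely illustrative example, a smoothstep-style S-curve is one way to "add contrast" to an opacity ramp:

```python
# purely illustrative: one way to add contrast to an opacity ramp is a
# smoothstep-style S-curve; the remap actually used in the project may differ
def remap_opacity(x):
    """Map [0, 1] -> [0, 1], steepening the middle so mid opacities
    spread apart (more contrast) while 0 and 1 stay fixed."""
    return 3 * x**2 - 2 * x**3

print([round(remap_opacity(v), 2) for v in (0.0, 0.25, 0.5, 0.75, 1.0)])
# [0.0, 0.16, 0.5, 0.84, 1.0]
```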
## [EXP] recommended
## [EXP] custom similarity
### technical note
* had to build a `similarity_graph` function to get the sentence-similarity matrix of a text (a sketch follows this list).
* those numbers are computed in the `_get_similarity` function in `summarizer.py`: basically counting shared words and dividing by the sentence lengths. the values range from roughly 0 to 3.5 and are not symmetrized or normalized in any way, so it feels like we can feed in whatever weights we want.
* since we want to input our own matrices, we create `set_graph_custom_edge_weights` and `custom_summarize` functions (sketched after this list).
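
a rough sketch of that count-based similarity and of a `similarity_graph`-style matrix builder; this assumes a gensim-like formula (shared word counts divided by the log of the sentence lengths) and is not the exact code from `summarizer.py`:

```python
# rough sketch of the count-based similarity described above, assuming a
# gensim-style formula (shared word counts over log sentence lengths);
# the exact code in summarizer.py may differ
import math
import numpy as np

def sentence_similarity(s1, s2):
    w1, w2 = s1.lower().split(), s2.lower().split()
    if not w1 or not w2:
        return 0.0
    # shared tokens, counted with multiplicity
    common = sum(min(w1.count(w), w2.count(w)) for w in set(w1) & set(w2))
    denom = math.log10(len(w1)) + math.log10(len(w2))
    return common / denom if denom > 0 else 0.0

def similarity_graph(sentences):
    """n x n matrix of pairwise sentence similarities, diagonal left at 0."""
    n = len(sentences)
    matrix = np.zeros((n, n))
    for i in range(n):
        for j in range(n):
            if i != j:
                matrix[i, j] = sentence_similarity(sentences[i], sentences[j])
    return matrix

sentences = [
    "the cat sat on the mat",
    "the mat was under the cat",
    "opacity curves still need more contrast",
]
print(similarity_graph(sentences))
```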
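
and a guess at how the custom entry points could slot in: the names `set_graph_custom_edge_weights` and `custom_summarize` come from the note above, but the signatures and the gensim-style graph API (`nodes()`, `has_edge()`, `add_edge()`) used here are assumptions, not the actual implementation:

```python
# hypothetical sketch only: function names come from the note above, but the
# signatures and the gensim-style graph API (nodes(), has_edge(), add_edge())
# are assumptions, not the actual implementation
def set_graph_custom_edge_weights(graph, weights):
    """Copy a user-supplied weight matrix onto the sentence graph, in place of
    the weights normally computed by _get_similarity."""
    nodes = graph.nodes()
    for i, a in enumerate(nodes):
        for j, b in enumerate(nodes):
            if i == j or graph.has_edge((a, b)):
                continue
            if weights[i][j] != 0:
                graph.add_edge((a, b), weights[i][j])

def custom_summarize(text, weights, ratio=0.2):
    """Outline of a summarize() variant that takes its edge weights from us:
    1. split `text` into sentences and build the sentence graph as usual,
    2. call set_graph_custom_edge_weights(graph, weights) instead of the
       built-in edge-weighting step,
    3. run pagerank on the graph and return the top `ratio` of sentences.
    """
    ...
```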