custom_summarize with self-made matrix

master
Dorian 2 years ago
parent d069ffece8
commit 40ea2cae20

@@ -31,3 +31,11 @@ modify the variable `wikipedia_page` in `make.py` to whatever page then
 * **opacities were remapped** to add contrast to their curves. still need to experiment with that to find some kind of nice compromise on both paper and screen.

 ## [EXP] recommended

+## [EXP] custom similarity
+### technical note
+* had to build a `similarity_graph` function to get the similarity matrix of a text.
+* the weights come from the `_get_similarity` function in `summarizer.py`, which basically counts the words two sentences have in common and divides by the sum of the logs of their lengths. the values range from roughly 0 to 3.5 and are not symmetrized or normalized in any way, so it seems we can feed it whatever numbers we want.
+* we want to input our own matrices, so we add `set_graph_custom_edge_weights` and `custom_summarize` (usage sketch below).
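A minimal usage sketch, assuming the new functions are importable from a module like `summa.custom` (hypothetical path) and an `article.txt` input file; the custom matrix is a nested dict keyed by sentence text on both axes, which is what `set_graph_custom_edge_weights` looks up:

```python
# minimal sketch -- `summa.custom` and article.txt are hypothetical stand-ins
from summa.custom import scored_sentences, custom_summarize

with open("article.txt") as f:
    text = f.read()

# run the stock pipeline once, just to get the processed sentence texts
sentences = scored_sentences(text)

# hand-made similarity matrix: nested dict keyed by sentence text,
# here with a uniform weight so every pair of sentences is equally similar
matrix = {s1.text: {s2.text: 1.0 for s2 in sentences} for s1 in sentences}

# custom_summarize returns the scored sentence objects (see the EDIT below)
for sentence in custom_summarize(text, matrix):
    print(round(sentence.score, 4), sentence.text)
```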

@@ -1,18 +1,33 @@
+from os import remove
 from .pagerank_weighted import pagerank_weighted_scipy as _pagerank
 from .preprocessing.textcleaner import clean_text_by_sentences as _clean_text_by_sentences
 from .commons import build_graph as _build_graph
 from .commons import remove_unreachable_nodes as _remove_unreachable_nodes
 from .summarizer import _set_graph_edge_weights
 from .summarizer import _add_scores_to_sentences
+from .summarizer import _create_valid_graph
+
+
+def count_words(line):
+    return len(line.split(" "))
+
+
+def remove_shorts(sentences, words_number):
+    # keep only the sentences with more than words_number words
+    return [s for s in sentences if count_words(s.text) > words_number]


-def scored_sentences(text, language="english", split=False, additional_stopwords=None):
+# SCORED SENTENCES (output the list of sentences with their scores)
+# -------------------------------------------------------------------------------------
+def scored_sentences(text, words_number=0, language="english", split=False, additional_stopwords=None):
     if not isinstance(text, str):
         raise ValueError("Text parameter must be a Unicode object (str)!")

     # Gets a list of processed sentences.
     sentences = _clean_text_by_sentences(text, language, additional_stopwords)

+    # Remove sentences with fewer than words_number words.
+    if words_number:
+        sentences = remove_shorts(sentences, words_number)
+
     # Creates the graph and calculates the similarity coefficient for every pair of nodes.
     graph = _build_graph([sentence.token for sentence in sentences])
     _set_graph_edge_weights(graph)
@@ -33,15 +48,83 @@ def scored_sentences(text, language="english", split=False, additional_stopwords
     return sentences


-def similarity_graph(text, language="english", additional_stopwords=None):
+# SIMILARITY GRAPH (output the whole matrix/graph of similarity with weighted edges)
+# -------------------------------------------------------------------------------------
+def similarity_graph(text, words_number=0, language="english", additional_stopwords=None):
     if not isinstance(text, str):
         raise ValueError("Text parameter must be a Unicode object (str)!")

     # Gets a list of processed sentences.
     sentences = _clean_text_by_sentences(text, language, additional_stopwords)

+    # Remove sentences with fewer than words_number words.
+    if words_number:
+        sentences = remove_shorts(sentences, words_number)
+
     # Creates the graph and calculates the similarity coefficient for every pair of nodes.
     graph = _build_graph([sentence.token for sentence in sentences])
     _set_graph_edge_weights(graph)

     return graph
+
+
+# CUSTOM SUMMARIZATION (self-made similarity matrix)
+# -------------------------------------------------------------------------------------
+def set_graph_custom_edge_weights(sentences, graph, matrix):
+    # allows us to input a custom 'hand-made' matrix and feed it to pagerank
+    # graph nodes are sentence tokens; map them back to the sentence text,
+    # since the custom matrix is keyed by sentence text
+    text_by_token = {s.token: s.text for s in sentences}
+    for sentence_1 in graph.nodes():
+        for sentence_2 in graph.nodes():
+            s1_text = text_by_token[sentence_1]
+            s2_text = text_by_token[sentence_2]
+            edge = (sentence_1, sentence_2)
+            if sentence_1 != sentence_2 and not graph.has_edge(edge):
+                similarity = matrix[s1_text][s2_text]
+                if similarity != 0:
+                    graph.add_edge(edge, similarity)
+
+    # Handles the case in which all similarities are zero.
+    # The resultant summary will consist of random sentences.
+    if all(graph.edge_weight(edge) == 0 for edge in graph.edges()):
+        _create_valid_graph(graph)
+
+
+def custom_summarize(text, matrix, ratio=0.2, words=None, language="english", split=False, scores=False, additional_stopwords=None):
+    if not isinstance(text, str):
+        raise ValueError("Text parameter must be a Unicode object (str)!")
+
+    # Gets a list of processed sentences.
+    sentences = _clean_text_by_sentences(text, language, additional_stopwords)
+
+    # Creates the graph and sets the custom similarity for every pair of nodes.
+    graph = _build_graph([sentence.token for sentence in sentences])
+    set_graph_custom_edge_weights(sentences, graph, matrix)
+
+    # Removes all nodes with all edge weights equal to zero.
+    _remove_unreachable_nodes(graph)
+
+    # PageRank cannot be run on an empty graph.
+    if len(graph.nodes()) == 0:
+        return [] if split else ""
+
+    # Ranks the tokens using the PageRank algorithm. Returns dict of sentence -> score.
+    pagerank_scores = _pagerank(graph)
+
+    # Adds the summa scores to the sentence objects.
+    _add_scores_to_sentences(sentences, pagerank_scores)
+
+    # EDIT: return the whole sentences with scores
+    return sentences
+
+    # Extracts the most important sentences with the selected criterion.
+    # extracted_sentences = _extract_most_important_sentences(sentences, ratio, words)
+    # Sorts the extracted sentences by apparition order in the original text.
+    # extracted_sentences.sort(key=lambda s: s.index)
+    # return _format_results(extracted_sentences, split, scores)
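Conversely, the matrix that `similarity_graph` computes can be read back out through the graph methods this commit already relies on (`nodes`, `has_edge`, `edge_weight`). A sketch under the same hypothetical import path; note the node labels are the tokenized sentences, not the raw `.text` keys that `set_graph_custom_edge_weights` expects:

```python
# sketch: dump the built-in similarity matrix from similarity_graph
from summa.custom import similarity_graph  # hypothetical import path

text = open("article.txt").read()  # hypothetical input file
graph = similarity_graph(text, words_number=3)

# nested dict keyed by the tokenized node labels (NOT by sentence text)
matrix = {}
for n1 in graph.nodes():
    matrix[n1] = {}
    for n2 in graph.nodes():
        edge = (n1, n2)
        matrix[n1][n2] = graph.edge_weight(edge) if graph.has_edge(edge) else 0.0
```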

@@ -133,16 +133,13 @@ def summarize(text, ratio=0.2, words=None, language="english", split=False, scor
     # Adds the summa scores to the sentence objects.
     _add_scores_to_sentences(sentences, pagerank_scores)

-    # EDIT: return the whole sentences with scores
-    return sentences
-
     # Extracts the most important sentences with the selected criterion.
-    # extracted_sentences = _extract_most_important_sentences(sentences, ratio, words)
+    extracted_sentences = _extract_most_important_sentences(sentences, ratio, words)

     # Sorts the extracted sentences by apparition order in the original text.
-    # extracted_sentences.sort(key=lambda s: s.index)
+    extracted_sentences.sort(key=lambda s: s.index)

-    # return _format_results(extracted_sentences, split, scores)
+    return _format_results(extracted_sentences, split, scores)


 def get_graph(text, language="english"):
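With the EDIT reverted here, `summarize` behaves like stock summa again. A quick sketch of the restored return values, using the standard `summa.summarizer` entry point and a hypothetical input file:

```python
# summarize() is back to returning the formatted summary:
# a str by default, or a list of sentence strings when split=True
from summa.summarizer import summarize

text = open("article.txt").read()  # hypothetical input file
print(summarize(text, ratio=0.2))            # plain-text summary
print(summarize(text, words=50, split=True)) # list of sentences, ~50 words
```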
