thewarehouseandtheforest/summa/edits.py

from os import remove
from .pagerank_weighted import pagerank_weighted_scipy as _pagerank
from .preprocessing.textcleaner import clean_text_by_sentences as _clean_text_by_sentences
from .commons import build_graph as _build_graph
from .commons import remove_unreachable_nodes as _remove_unreachable_nodes
from .summarizer import _set_graph_edge_weights
from .summarizer import _add_scores_to_sentences
from .summarizer import _create_valid_graph

def count_words(line):
    """Return the number of whitespace-separated words in *line*.

    Splitting on arbitrary whitespace (rather than on a single space
    character) means runs of spaces, tabs and newlines do not produce
    phantom empty "words", and an empty or blank string counts as zero
    words instead of one.
    """
    return len(line.split())

def remove_shorts(sentences, words_number):
    """Return the sentences whose text has strictly more than *words_number* words.

    Each item of *sentences* must expose a ``text`` attribute; the word
    count is delegated to ``count_words``. The input is not modified and
    a new list is always returned.
    """
    kept = []
    for unit in sentences:
        if count_words(unit.text) > words_number:
            kept.append(unit)
    return kept


# SCORED SENTENCES (output the list of sentences with their scores)
# -------------------------------------------------------------------------------------

def scored_sentences(text, words_number=0, language="english", split=False, additional_stopwords=None):
    """Score every sentence of *text* with PageRank and return the sentences.

    The text is cleaned into sentence units, optionally filtered with
    ``remove_shorts`` when *words_number* is truthy, turned into a
    similarity graph keyed by each unit's ``token``, and ranked.

    Returns the list of sentence units after ``_add_scores_to_sentences``
    has been applied to it. If the graph has no nodes left once
    unreachable nodes are removed, returns ``[]`` when *split* is true and
    ``""`` otherwise.

    Raises:
        ValueError: if *text* is not a ``str``.
    """
    if not isinstance(text, str):
        raise ValueError("Text parameter must be a Unicode object (str)!")

    # Clean and split the text into sentence units.
    units = _clean_text_by_sentences(text, language, additional_stopwords)

    # Optionally discard the units that are too short.
    units = remove_shorts(units, words_number) if words_number else units

    # One node per sentence token, weighted by pairwise similarity.
    tokens = [unit.token for unit in units]
    similarity = _build_graph(tokens)
    _set_graph_edge_weights(similarity)

    # Drop the nodes whose edges all have zero weight.
    _remove_unreachable_nodes(similarity)

    # PageRank cannot be run on an empty graph.
    if not len(similarity.nodes()):
        if split:
            return []
        return ""

    # Rank the tokens, then attach the resulting scores to the units.
    ranks = _pagerank(similarity)
    _add_scores_to_sentences(units, ranks)

    return units


# SIMILARITY GRAPH (output the whole matrix/graph of similarity with weighted edges)
# -------------------------------------------------------------------------------------

def similarity_graph(text, words_number=0, language="english", additional_stopwords=None):
    """Build and return the weighted sentence-similarity graph of *text*.

    The text is cleaned into sentence units, optionally filtered with
    ``remove_shorts`` when *words_number* is truthy, and a graph is built
    with one node per unit ``token``. Edge weights are filled in by
    ``_set_graph_edge_weights``. Unlike ``scored_sentences``, no nodes are
    pruned and no ranking is run: the graph is returned as is.

    Raises:
        ValueError: if *text* is not a ``str``.
    """
    if not isinstance(text, str):
        raise ValueError("Text parameter must be a Unicode object (str)!")

    # Clean and split the text into sentence units.
    units = _clean_text_by_sentences(text, language, additional_stopwords)

    # Optionally discard the units that are too short.
    units = remove_shorts(units, words_number) if words_number else units

    # One node per sentence token, weighted by pairwise similarity.
    tokens = [unit.token for unit in units]
    similarity = _build_graph(tokens)
    _set_graph_edge_weights(similarity)
    return similarity


# CUSTOM SUMMARIZATION (self-made similarity matrix)
# -------------------------------------------------------------------------------------

def set_graph_custom_edge_weights(sentences, graph, matrix):
    """Fill *graph* with edge weights taken from a hand-made similarity matrix.

    Args:
        sentences: the sentence units behind the graph nodes. Either an
            iterable of objects exposing ``token`` and ``text`` (what
            ``custom_summarize`` passes), or a container that can be
            indexed directly by node (e.g. a dict keyed by node).
        graph: graph whose nodes identify sentences; it is mutated in place.
        matrix: nested mapping such that ``matrix[text_a][text_b]`` is the
            similarity between the sentences with those original texts.

    For every pair of distinct nodes that is not connected yet, the
    similarity is looked up by sentence text and, when non-zero, added as a
    weighted edge. If no edge ends up with a non-zero weight, the graph is
    handed to ``_create_valid_graph``.

    Raises:
        KeyError: if *matrix* has no entry for a pair of sentence texts.
    """
    nodes = graph.nodes()

    # The graph is built from `sentence.token`, so nodes are tokens, not
    # positions: a plain list of sentences cannot be indexed by node.
    # Resolve node -> sentence through a token map instead.
    if isinstance(sentences, dict):
        by_token = {}
    else:
        by_token = {s.token: s for s in sentences if hasattr(s, "token")}

    # Resolve each node's text once, outside the quadratic pair loop.
    # Nodes missing from the token map fall back to direct indexing, which
    # keeps containers that are keyed by node working as before.
    text_of = {}
    for node in nodes:
        unit = by_token[node] if node in by_token else sentences[node]
        text_of[node] = unit.text

    for node_a in nodes:
        for node_b in nodes:
            if node_a == node_b:
                continue

            edge = (node_a, node_b)
            if graph.has_edge(edge):
                continue

            similarity = matrix[text_of[node_a]][text_of[node_b]]
            if similarity != 0:
                graph.add_edge(edge, similarity)

    # Handles the case in which all similarities are zero.
    # The resultant summary will consist of random sentences.
    if all(graph.edge_weight(edge) == 0 for edge in graph.edges()):
        _create_valid_graph(graph)


def custom_summarize(text, matrix, ratio=0.2, words=None, language="english", split=False, scores=False, additional_stopwords=None):
    """Score the sentences of *text* using a caller-supplied similarity matrix.

    The text is cleaned into sentence units, a graph is built with one node
    per unit ``token``, its edges are weighted from *matrix* by
    ``set_graph_custom_edge_weights``, and the graph is ranked with PageRank.

    Returns the full list of sentence units after
    ``_add_scores_to_sentences`` has been applied to it. If the graph has
    no nodes left once unreachable nodes are removed, returns ``[]`` when
    *split* is true and ``""`` otherwise.

    NOTE: the selection, re-ordering and formatting steps (formerly
    ``_extract_most_important_sentences`` / ``_format_results``) are
    disabled, so *ratio*, *words* and *scores* are accepted but unused.

    Raises:
        ValueError: if *text* is not a ``str``.
    """
    if not isinstance(text, str):
        raise ValueError("Text parameter must be a Unicode object (str)!")

    # Clean and split the text into sentence units.
    units = _clean_text_by_sentences(text, language, additional_stopwords)

    # One node per sentence token, weighted from the hand-made matrix.
    tokens = [unit.token for unit in units]
    similarity = _build_graph(tokens)
    set_graph_custom_edge_weights(units, similarity, matrix)

    # Drop the nodes whose edges all have zero weight.
    _remove_unreachable_nodes(similarity)

    # PageRank cannot be run on an empty graph.
    if not len(similarity.nodes()):
        if split:
            return []
        return ""

    # Rank the tokens, then attach the resulting scores to the units.
    ranks = _pagerank(similarity)
    _add_scores_to_sentences(units, ranks)

    # EDIT: return the whole sentences with scores
    return units
custom_summarize with self made matrix 2 years ago			`from os import remove`
clean 2 years ago			`from .pagerank_weighted import pagerank_weighted_scipy as _pagerank`
			`from .preprocessing.textcleaner import clean_text_by_sentences as _clean_text_by_sentences`
			`from .commons import build_graph as _build_graph`
			`from .commons import remove_unreachable_nodes as _remove_unreachable_nodes`
			`from .summarizer import _set_graph_edge_weights`
			`from .summarizer import _add_scores_to_sentences`
custom_summarize with self made matrix 2 years ago			`from .summarizer import _create_valid_graph`
clean 2 years ago
custom_summarize with self made matrix 2 years ago			`def count_words(line):`
			`return len(line.split(" "))`
clean 2 years ago
custom_summarize with self made matrix 2 years ago			`def remove_shorts(sentences, words_number):`
			`return [s for s in sentences if count_words(s.text) > words_number]`


			`# SCORED SENTENCES (output the list of sentence with their scores)`
			`# -------------------------------------------------------------------------------------`

			`def scored_sentences(text, words_number=0, language="english", split=False, additional_stopwords=None):`
clean 2 years ago			`if not isinstance(text, str):`
			`raise ValueError("Text parameter must be a Unicode object (str)!")`

			`# Gets a list of processed sentences.`
			`sentences = _clean_text_by_sentences(text, language, additional_stopwords)`

custom_summarize with self made matrix 2 years ago			`# Remove sentence with less than words_number words.`
			`if words_number:`
			`sentences = remove_shorts(sentences, words_number)`

clean 2 years ago			`# Creates the graph and calculates the similarity coefficient for every pair of nodes.`
			`graph = _build_graph([sentence.token for sentence in sentences])`
			`_set_graph_edge_weights(graph)`

			`# Remove all nodes with all edges weights equal to zero.`
			`_remove_unreachable_nodes(graph)`

			`# PageRank cannot be run in an empty graph.`
			`if len(graph.nodes()) == 0:`
			`return [] if split else ""`

			`# Ranks the tokens using the PageRank algorithm. Returns dict of sentence -> score`
			`pagerank_scores = _pagerank(graph)`

			`# Adds the summa scores to the sentence objects.`
			`_add_scores_to_sentences(sentences, pagerank_scores)`

wikipage getter in a separate folder so its also usable from different experiments 2 years ago			`return sentences`


custom_summarize with self made matrix 2 years ago			`# SIMILARITY GRAPH (output the whole matrix/graph of similarity with weighted edges)`
			`# -------------------------------------------------------------------------------------`

			`def similarity_graph(text, words_number=0, language="english", additional_stopwords=None):`
wikipage getter in a separate folder so its also usable from different experiments 2 years ago			`if not isinstance(text, str):`
			`raise ValueError("Text parameter must be a Unicode object (str)!")`

			`# Gets a list of processed sentences.`
			`sentences = _clean_text_by_sentences(text, language, additional_stopwords)`

custom_summarize with self made matrix 2 years ago			`# Remove sentence with less than words_number words.`
			`if words_number:`
			`sentences = remove_shorts(sentences, words_number)`

wikipage getter in a separate folder so its also usable from different experiments 2 years ago			`# Creates the graph and calculates the similarity coefficient for every pair of nodes.`
			`graph = _build_graph([sentence.token for sentence in sentences])`
			`_set_graph_edge_weights(graph)`

custom_summarize with self made matrix 2 years ago			`return graph`


			`# CUSTOM SUMMARIZATION (self made similarity matrix)`
			`# -------------------------------------------------------------------------------------`

			`def set_graph_custom_edge_weights(sentences, graph, matrix):`
			`# allow us to input a custom 'hand-made' matrix and feed it to pagerank`

			`for sentence_1 in graph.nodes():`
			`for sentence_2 in graph.nodes():`

			`s1_text = sentences[sentence_1].text`
			`s2_text = sentences[sentence_2].text`

			`edge = (sentence_1, sentence_2)`
			`if sentence_1 != sentence_2 and not graph.has_edge(edge):`
			`similarity = matrix[s1_text][s2_text]`
			`if similarity != 0:`
			`graph.add_edge(edge, similarity)`

			`# Handles the case in which all similarities are zero.`
			`# The resultant summary will consist of random sentences.`
			`if all(graph.edge_weight(edge) == 0 for edge in graph.edges()):`
			`_create_valid_graph(graph)`


			`def custom_summarize(text, matrix, ratio=0.2, words=None, language="english", split=False, scores=False, additional_stopwords=None):`

			`if not isinstance(text, str):`
			`raise ValueError("Text parameter must be a Unicode object (str)!")`

			`# Gets a list of processed sentences.`
			`sentences = _clean_text_by_sentences(text, language, additional_stopwords)`

			`# Creates the graph and calculates the similarity coefficient for every pair of nodes.`
			`graph = _build_graph([sentence.token for sentence in sentences])`
			`set_graph_custom_edge_weights(sentences, graph, matrix)`

			`# Remove all nodes with all edges weights equal to zero.`
			`_remove_unreachable_nodes(graph)`

			`# PageRank cannot be run in an empty graph.`
			`if len(graph.nodes()) == 0:`
			`return [] if split else ""`

			`# Ranks the tokens using the PageRank algorithm. Returns dict of sentence -> score`
			`pagerank_scores = _pagerank(graph)`

			`# Adds the summa scores to the sentence objects.`
			`_add_scores_to_sentences(sentences, pagerank_scores)`

			`# EDIT: return the whole sentences with scores`
			`return sentences`

			`# Extracts the most important sentences with the selected criterion.`
			`# extracted_sentences = _extract_most_important_sentences(sentences, ratio, words)`

			`# Sorts the extracted sentences by apparition order in the original text.`
			`# extracted_sentences.sort(key=lambda s: s.index)`

			`# return _format_results(extracted_sentences, split, scores)`