from math import log10

from .pagerank_weighted import pagerank_weighted_scipy as _pagerank
from .preprocessing.textcleaner import clean_text_by_sentences as _clean_text_by_sentences
from .commons import build_graph as _build_graph
from .commons import remove_unreachable_nodes as _remove_unreachable_nodes


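# A TextRank-style extractive summarizer: each sentence becomes a node in a
# graph, edges are weighted by lexical overlap between sentence pairs, and
# PageRank is run over the graph to score sentences by centrality.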
def _set_graph_edge_weights(graph):
    for sentence_1 in graph.nodes():
        for sentence_2 in graph.nodes():

            edge = (sentence_1, sentence_2)
            if sentence_1 != sentence_2 and not graph.has_edge(edge):
                similarity = _get_similarity(sentence_1, sentence_2)
                if similarity != 0:
                    graph.add_edge(edge, similarity)

    # Handles the case in which all similarities are zero.
    # The resultant summary will consist of random sentences.
    if all(graph.edge_weight(edge) == 0 for edge in graph.edges()):
        _create_valid_graph(graph)


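# Degenerate-input fallback used by _set_graph_edge_weights: rebuild the graph
# as a complete graph with uniform weight 1, so PageRank still runs (and every
# sentence ends up with essentially the same score).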
def _create_valid_graph(graph):
    nodes = graph.nodes()

    for i in range(len(nodes)):
        for j in range(len(nodes)):
            if i == j:
                continue

            edge = (nodes[i], nodes[j])

            if graph.has_edge(edge):
                graph.del_edge(edge)

            graph.add_edge(edge, 1)


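# Sentence similarity as in the TextRank paper (Mihalcea & Tarau, 2004), here
# with base-10 logarithms:
#
#     similarity(S1, S2) = (number of shared words) / (log10(|S1|) + log10(|S2|))
#
# The log normalization avoids favoring long sentences. Worked example: for
# s1 = "the cat sat" and s2 = "the cat ran", the shared words are
# {"the", "cat"}, so similarity = 2 / (log10(3) + log10(3)) ≈ 2.10.
# Note that both sentences are assumed non-empty (log10(0) would raise a
# ValueError); two one-word sentences are handled by the zero-denominator check.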
def _get_similarity(s1, s2):
    words_sentence_one = s1.split()
    words_sentence_two = s2.split()

    common_word_count = _count_common_words(words_sentence_one, words_sentence_two)

    log_s1 = log10(len(words_sentence_one))
    log_s2 = log10(len(words_sentence_two))

    if log_s1 + log_s2 == 0:
        return 0

    return common_word_count / (log_s1 + log_s2)


def _count_common_words(words_sentence_one, words_sentence_two):
    return len(set(words_sentence_one) & set(words_sentence_two))


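# Output shapes: a list of (text, score) tuples when `score` is set, a list of
# sentence strings when `split` is set, otherwise a single newline-joined string.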
def _format_results(extracted_sentences, split, score):
    if score:
        return [(sentence.text, sentence.score) for sentence in extracted_sentences]
    if split:
        return [sentence.text for sentence in extracted_sentences]
    return "\n".join([sentence.text for sentence in extracted_sentences])


def _add_scores_to_sentences(sentences, scores):
    for sentence in sentences:
        # Adds the score to the object if it has one.
        if sentence.token in scores:
            sentence.score = scores[sentence.token]
        else:
            sentence.score = 0


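# Greedy selection, illustrated: with words=100 and ranked sentence lengths
# [40, 35, 30, 30], the running count grows 40 -> 75 -> 105; the third sentence
# is kept because 105 is closer to 100 than 75 is, and the fourth is rejected
# because 135 would be farther from 100 than 105 is.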
def _get_sentences_with_word_count(sentences, words):
    """ Given a list of sentences, returns a list of sentences with a
    total word count similar to the word count provided.
    """
    word_count = 0
    selected_sentences = []
    # Loops until the word count is reached.
    for sentence in sentences:
        words_in_sentence = len(sentence.text.split())

        # Checks if the inclusion of the sentence gives a better approximation
        # to the word parameter.
        if abs(words - word_count - words_in_sentence) > abs(words - word_count):
            return selected_sentences

        selected_sentences.append(sentence)
        word_count += words_in_sentence

    return selected_sentences


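# Selection criterion: `words`, when given, takes precedence over `ratio`.
# With 10 ranked sentences, ratio=0.2 keeps the top 2; with words set, the
# ratio is ignored and top-ranked sentences are taken until the target word
# count is approximated as above.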
def _extract_most_important_sentences(sentences, ratio, words):
    sentences.sort(key=lambda s: s.score, reverse=True)

    # If no "words" option is selected, the number of sentences is
    # reduced by the provided ratio.
    if words is None:
        length = len(sentences) * ratio
        return sentences[:int(length)]

    # Else, the ratio is ignored.
    else:
        return _get_sentences_with_word_count(sentences, words)


def summarize(text, ratio=0.2, words=None, language="english", split=False, scores=False, additional_stopwords=None):
    if not isinstance(text, str):
        raise ValueError("Text parameter must be a Unicode object (str)!")

    # Gets a list of processed sentences.
    sentences = _clean_text_by_sentences(text, language, additional_stopwords)

    # Creates the graph and calculates the similarity coefficient for every pair of nodes.
    graph = _build_graph([sentence.token for sentence in sentences])
    _set_graph_edge_weights(graph)

    # Removes all nodes whose edge weights are all equal to zero.
    _remove_unreachable_nodes(graph)

    # PageRank cannot be run on an empty graph.
    if len(graph.nodes()) == 0:
        return [] if split else ""

    # Ranks the tokens using the PageRank algorithm. Returns a dict of sentence -> score.
    pagerank_scores = _pagerank(graph)

    # Adds the summa scores to the sentence objects.
    _add_scores_to_sentences(sentences, pagerank_scores)

    # EDIT: return the whole sentences with scores
    return sentences

    # Extracts the most important sentences with the selected criterion.
    # extracted_sentences = _extract_most_important_sentences(sentences, ratio, words)

    # Sorts the extracted sentences by order of appearance in the original text.
    # extracted_sentences.sort(key=lambda s: s.index)

    # return _format_results(extracted_sentences, split, scores)


def get_graph(text, language="english"):
    sentences = _clean_text_by_sentences(text, language)

    graph = _build_graph([sentence.token for sentence in sentences])
    _set_graph_edge_weights(graph)

    return graph


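# Example usage (a sketch, assuming this module sits inside the `summa`
# package so the relative imports resolve; because of the EDIT above,
# `summarize` returns the scored sentence objects rather than formatted text):
#
#     from summa.summarizer import summarize
#
#     text = "First sentence of a long document. Second sentence. ..."
#     for sentence in summarize(text, language="english"):
#         print(sentence.score, sentence.text)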