# thewarehouseandtheforest/exp.opacity/make.py

from jinja2 import Template
import os
import wikipedia
from markdown import markdown

# importing module
import sys

# appending a path
sys.path.append('../')

# importing required module
import summa.summarizer
from summa.summarizer import scored_sentences


# TODO:
# * DONE: wiki header

# those 3 would require starting from the HTML itself and keeping an index...
# * wiki paragraph
# * wiki hyperlinks
# * list


#   variables
#   ------------------------------------------------------------------------

# wikipedia_page = "forest"
# wikipedia_page = "warehouse"
# wikipedia_page = "river"
wikipedia_page = "elderflower"
# wikipedia_page = "mushroom"

TEMPLATE_PATH = 'template.html'
HTML_PATH = 'www/index.html'


#   utilities
#   ------------------------------------------------------------------------

def map_value(value, min, max, new_min, new_max):
    """Linearly rescale ``value`` from the range [min, max] to [new_min, new_max].

    ``min`` maps to ``new_min`` and ``max`` maps to ``new_max``; values
    outside the source range are extrapolated, not clamped.  With plain
    numbers, ``max == min`` raises ``ZeroDivisionError``.
    """
    source_span = max - min
    target_span = new_max - new_min
    # relative position of value inside the source range (0 at min, 1 at max)
    position = (value - min) / source_span
    return position * target_span + new_min

def remap_score(s, min_score, max_score):
    """Normalise and invert ``s.score`` in place, then return ``s``.

    The score is first mapped from [min_score, max_score] onto [0, 1] with
    ``map_value`` and then flipped, so ``min_score`` becomes 1 and
    ``max_score`` becomes 0.
    """
    normalised = map_value(s.score, min_score, max_score, 0, 1)
    s.score = 1 - normalised
    return s

def compress_score(s):
    """Cube ``s.score`` and snap it to 1 above 0.8; mutates and returns ``s``.

    Cubing pushes scores below 1 towards 0; anything still above 0.8 after
    cubing is set to exactly 1.
    """
    # compress whites
    cubed = s.score ** 3

    # limiter (alternative kept for reference: stretch + limiter)
    # cubed = min(map_value(cubed, 0, 1, 0, 1.5), 1)
    if cubed > 0.8:
        cubed = 1

    s.score = cubed
    return s


#   wikipedia
#   ------------------------------------------------------------------------

def wikipage(pagename):
    """Fetch the English Wikipedia page best matching ``pagename``.

    Prints ``pagename``, runs a one-result search for it and loads the first
    hit as a ``wikipedia.WikipediaPage`` (following redirects, preloaded).

    Returns the page object, or an empty string when the lookup raises a
    ``DisambiguationError`` (its options are printed first).  Raises
    ``wikipedia.PageError`` when the search returns no result.
    """
    print(pagename)
    wikipedia.set_lang("en")

    try:
        matches = wikipedia.search(pagename, results=1, suggestion=False)
        try:
            best_match = matches[0]
        except IndexError:
            # no search result at all: the page doesn't exist
            raise wikipedia.PageError(pagename)
        return wikipedia.WikipediaPage(best_match, redirect=True, preload=True)
    except wikipedia.exceptions.DisambiguationError as ambiguous:
        # show the candidate pages so the caller can pick a more precise name
        print(ambiguous.options)

    # falsy placeholder: signals "no page" to the caller
    return ''


#   parsing and gluing html
#   ------------------------------------------------------------------------

def is_header(s):
    """Detect a wikitext-style header such as ``== History ==``.

    ``s`` is any object with a ``text`` string attribute.  The header level
    is the number of ``=`` characters that open AND close the text (the
    shorter of the two runs when they differ).

    Returns ``[header_text, header_level]`` for a header, with the
    surrounding ``=`` markers and whitespace removed from ``header_text``.
    Returns ``None`` when the text is not a header; this includes empty text
    and text made only of ``=`` characters.
    """
    text = s.text

    # Nothing but '=' (or nothing at all): there is no title, so not a header.
    # This also guarantees the leading and trailing runs below cannot overlap.
    if not text.strip('='):
        return None

    leading = len(text) - len(text.lstrip('='))
    trailing = len(text) - len(text.rstrip('='))
    header_level = min(leading, trailing)

    if header_level == 0:
        return None

    # Strip exactly `header_level` markers from each side, then whitespace.
    header_text = text[header_level:len(text) - header_level].strip()
    return [header_text, header_level]

def wiki_parse(sentences):
    """Attach an HTML fragment to every sentence and return the kept ones.

    Each item of ``sentences`` must expose ``text`` and ``score``.  Items are
    modified in place:

    * a wikitext header (as detected by ``is_header``) gets its ``score``
      replaced by the mean score of the non-header sentences that follow it,
      up to the next header of the same or a shallower level (or the string
      ``"NaN"`` when there are none), and ``html`` set to an ``<hN>`` tag;
    * any other sentence gets ``html`` set to a ``<span>`` tag.

    Both tags carry the score as an inline CSS ``opacity``.  The sentence and
    header text is HTML-escaped before being placed in the tag.  Processing
    stops at the first "References" or "See also" header, which is dropped
    together with everything after it.

    Returns a new list with the processed sentences in their original order.
    """
    # Local import keeps this function self-contained.  The text comes from
    # an external page and is spliced into raw HTML, so '&', '<' and '>' must
    # be escaped or they would corrupt the markup.
    from html import escape

    # TODO: doesn't work with section nesting!!

    new_sentences = []

    print('--- HEADERS ---')
    for i, s in enumerate(sentences):

        header = is_header(s)

        # not a header: plain sentence rendered as a span
        if not header:
            s.html = '<span style="opacity:{};">{}</span>'.format(
                s.score, escape(s.text, quote=False))
            new_sentences.append(s)
            continue

        header_text, header_level = header
        print(header_text)

        # average the scores of the sentences belonging to this section
        current_total = 0
        current_count = 0
        for j in range(i + 1, len(sentences)):
            s2 = sentences[j]
            s2_header = is_header(s2)

            if not s2_header:
                # adding every sentence to the average
                current_total += s2.score
                current_count += 1
                continue

            print('  ' + s2_header[0])
            if header_level >= s2_header[1]:
                # header of the same or a shallower level: section is over
                print('X ' + s2_header[0])
                break

        if current_count != 0:
            s.score = current_total / current_count
        else:
            # NOTE(review): an empty section ends up as `opacity:NaN`, which
            # is presumably ignored by browsers - confirm this is intended.
            s.score = "NaN"

        s.html = '<h{level} style="opacity:{score};">{text}</h{level}>'.format(
            level=header_level,
            score=s.score,
            text=escape(header_text, quote=False))

        # stops at the references part
        if header_text in ("References", "See also"):
            break

        new_sentences.append(s)

    return new_sentences


#   textrank
#   ------------------------------------------------------------------------

def txt2rankedsentences(txt):
    """Score the sentences of ``txt`` with the imported ``scored_sentences``.

    The call is made with ``split=True`` and its result is returned
    unchanged.  The rest of this file treats that result as a list of
    objects exposing ``text`` and ``score``.
    """
    ranked = scored_sentences(txt, split=True)
    return ranked


#   main
#   ------------------------------------------------------------------------

if __name__ == '__main__':

    # Script flow: fetch the Wikipedia page, score its sentences, turn the
    # scores into opacities, wrap everything in HTML and render the template.

    # Local import: only `escape` is needed, and the name `html` is reused
    # below for the rendered output.
    from html import escape

    # --- WIKI REQUEST ---

    # get text from wikipedia
    print('--- WIKI ---')
    page = wikipage(wikipedia_page)
    if not page:
        sys.exit("--- STOP ---")
    # the title is placed in raw HTML, so escape markup-significant characters
    title = '<h1>' + escape(page.title, quote=False) + '</h1>'
    text = page.content

    # print text in terminal
    print('--- TXT ---')
    print(text)


    # --- APPLY TEXTRANK ---

    # apply textrank
    sentences = txt2rankedsentences(text)

    # print ranked sentences in terminal
    print('--- SENTENCES ---')
    for s in sentences:
        print('[{score}] : {sentence}'.format(score=s.score, sentence=s.text))

    # nothing to rank: stop cleanly instead of failing on min()/max() below
    if not sentences:
        sys.exit("--- NO SENTENCES ---")


    # --- REMAP AND COMPRESS ---

    # score range of the page (no need to sort just to get both ends)
    scores = [s.score for s in sentences]
    max_score = max(scores)
    min_score = min(scores)

    if max_score == min_score:
        # Degenerate range (e.g. a single sentence): remapping would divide
        # by zero.  Every sentence is also the lowest-scored one, which the
        # remap turns into 1, so use that value for all of them.
        for s in sentences:
            s.score = 1
    else:
        # remap sentences from 0 to 1
        sentences = [remap_score(s, min_score, max_score) for s in sentences]

    # compress scores (make more stuff invisible)
    sentences = [compress_score(s) for s in sentences]


    # --- PARSE ---

    # parse every sentence to either a span or a header
    sentences = wiki_parse(sentences)
    # add back page title
    sentences = [{'html': title, 'text': page.title, 'score': 1}] + sentences


    # --- TEMPLATING ---

    # Wikipedia text is full of non-ASCII characters; read and write UTF-8
    # explicitly instead of relying on the platform default encoding.

    # getting the template
    with open(TEMPLATE_PATH, 'r', encoding='utf-8') as file:
        template = Template(file.read())
    # render template
    html = template.render(sentences=sentences)
    with open(HTML_PATH, 'w', encoding='utf-8') as file:
        file.write(html)