from jinja2 import Template
import os
import wikipedia
from markdown import markdown

# importing module
import sys
from types import SimpleNamespace  # used to wrap (text, score) pairs from summa

# appending a path
# sys.path.append('textrank')

# importing required module
import summa.summarizer
from summa.summarizer import summarize


# TODO:
# * DONE: wiki header

# these 3 would require starting from the HTML itself and keeping an index...
# * wiki paragraph
# * wiki hyperlinks
# * list


# variables
# ------------------------------------------------------------------------

# wikipedia_page = "forest"
# wikipedia_page = "warehouse"
# wikipedia_page = "river"
wikipedia_page = "elderflower"
# wikipedia_page = "mushroom"

TEMPLATE_PATH = 'template.html'
HTML_PATH = 'www/index.html'


# utilities
# ------------------------------------------------------------------------

def map_value(value, in_min, in_max, out_min, out_max):
    # linearly remap value from [in_min, in_max] to [out_min, out_max]
    # (parameters renamed from min/max to avoid shadowing the builtins)
    return (((value - in_min) / (in_max - in_min)) * (out_max - out_min)) + out_min


def remap_score(s, min_score, max_score):
    # normalise to [0, 1], then invert: the highest-ranked sentence ends at 0
    s.score = 1 - map_value(s.score, min_score, max_score, 0, 1)
    return s


def compress_score(s):

    # compress whites
    s.score = s.score**3

    # stretch + limiter
    # s.score = min(map_value(s.score, 0, 1, 0, 1.5), 1)
    s.score = 1 if s.score > 0.8 else s.score

    return s
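
# A quick worked example of the scoring utilities (plain arithmetic, not
# real summa output):
#   map_value(5, 0, 10, 0, 1)  -> 0.5
# remap_score() inverts the range: a sentence at max_score ends up at 0,
# one at min_score at 1. compress_score() then cubes the value and snaps
# anything above 0.8 to 1:
#   0.5**3  -> 0.125            (mid greys fade quickly)
#   0.95**3 -> 0.857 -> 1.0     (limiter kicks in)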


# wikipedia
# ------------------------------------------------------------------------

def wikipage(pagename):
    # get wikipedia page content by name of the page

    print(pagename)
    wikipedia.set_lang("en")
    try:
        results = wikipedia.search(pagename, results=1, suggestion=False)
        try:
            pagename = results[0]
        except IndexError:
            # if there is no suggestion or search result, the page doesn't exist
            raise wikipedia.PageError(pagename)
        return wikipedia.WikipediaPage(pagename, redirect=True, preload=True)
    except wikipedia.exceptions.DisambiguationError as e:
        print(e.options)
        page = ''

    return page
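
# Behaviour sketch (hypothetical page names, for illustration only):
#   wikipage("elderflower") -> WikipediaPage for the top search hit
#   wikipage("qqqzzz")      -> raises wikipedia.PageError (no search result)
#   an ambiguous name      -> prints e.options and returns '' (DisambiguationError)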


# parsing and gluing html
# ------------------------------------------------------------------------

def is_header(s):

    # i is the header level: count matching '=' on both ends of the sentence
    # (the i < len // 2 guard prevents running off the end on a run of '=')
    i = 0
    while i < len(s.text) // 2 and s.text[i] == '=' and s.text[len(s.text) - 1 - i] == '=':
        i += 1

    if i > 0:
        # slice off exactly i '=' characters on each side
        # (the previous (-1-i) bound dropped one extra character)
        header_text = s.text[i:-i].strip()
        header_level = i
        return [header_text, header_level]
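
# Example: a sentence whose text is "== Culinary use ==" yields
# ["Culinary use", 2]; plain prose yields None (falsy), so callers can
# simply test `if header:`.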


def wiki_parse(sentences):

    # TODO: doesn't work with section nesting!!
    # 1. replace wikitext headers with html headers
    # 2. add the opacity to each element
    # 3. compute an artificial score for each header: the average score of its section

    new_sentences = []

    print('--- HEADERS ---')
    for i in range(len(sentences)):

        s = sentences[i]

        # if the sentence is a header
        header = is_header(s)
        if header:
            print(header[0])

            # start computing the average score of this section
            current_total = 0
            current_count = 0
            next_header_found = False
            j = i + 1

            # iterate until we find the next header of the same or higher level
            while j < len(sentences) and not next_header_found:

                s2 = sentences[j]
                s2_header = is_header(s2)

                if s2_header:
                    print(' ' + s2_header[0])
                    if header[1] >= s2_header[1]:
                        # encountered a header of the same or higher level
                        next_header_found = True
                        print('X ' + s2_header[0])

                else:
                    # add every plain sentence to the average
                    current_total += s2.score
                    current_count += 1

                j += 1

            if current_count != 0:
                s.score = current_total / current_count
            else:
                s.score = "NaN"

            s.html = '<h'+str(header[1])+' style="opacity:'+str(s.score)+';">'+header[0]+'</h'+str(header[1])+'>'

            # stop at the references part
            if header[0] == "References" or header[0] == "See also":
                break

            new_sentences.append(s)

        # not a header
        else:
            s.html = '<span style="opacity:'+str(s.score)+';">'+s.text+'</span>'
            new_sentences.append(s)

    return new_sentences
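
# Example of what one parsed sentence carries afterwards (opacity values
# are illustrative, not real output):
#   s.html == '<h2 style="opacity:0.42;">Culinary use</h2>'
#   s.html == '<span style="opacity:0.125;">Elderflowers are used ...</span>'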
|
||
|
|
||
|
|
||
|
# textrank
|
||
|
# ------------------------------------------------------------------------
|
||
|
|
||
|
def txt2rankedsentences(txt):
|
||
|
# from txt to ranked sentences
|
||
|
return summarize(txt, split=True)
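
# Usage sketch (scores are illustrative, not real TextRank output):
#   for s in txt2rankedsentences("First sentence. Second sentence."):
#       print(s.score, s.text)   # e.g. 0.51 'First sentence.'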


# main
# ------------------------------------------------------------------------

if __name__ == '__main__':


    # --- WIKI REQUEST ---

    # get text from wikipedia
    print('--- WIKI ---')
    page = wikipage(wikipedia_page)
    if not page:
        sys.exit("--- STOP ---")
    title = '<h1>'+page.title+'</h1>'
    text = page.content

    # print text in terminal
    print('--- TXT ---')
    print(text)


    # --- APPLY TEXTRANK ---

    # apply textrank
    sentences = txt2rankedsentences(text)

    # print ranked sentences in terminal
    print('--- SENTENCES ---')
    for s in sentences:
        print('[{score}] : {sentence}'.format(score=s.score, sentence=s.text))


    # --- REMAP AND COMPRESS ---

    # sorted version of the list
    sorted_sentences = sorted(sentences, key=lambda s: s.score, reverse=True)
    # remap sentence scores to the 0-1 range
    max_score = sorted_sentences[0].score
    min_score = sorted_sentences[-1].score
    sentences = [remap_score(s, min_score, max_score) for s in sentences]
    # compress scores (make more stuff invisible)
    sentences = [compress_score(s) for s in sentences]
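
    # Worked example: raw scores [0.02, 0.10, 0.18] remap (inverted) to
    # [1.0, 0.5, 0.0], then compress to [1.0, 0.125, 0.0] -- so the most
    # central sentence ends up invisible and the least central fully opaque.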


    # -- PARSE ---

    # parse every sentence into either a span or a header
    sentences = wiki_parse(sentences)
    # add back page title
    sentences = [{'html': title, 'text': page.title, 'score': 1}] + sentences


    # -- TEMPLATING ---

    # getting the template
    with open(TEMPLATE_PATH, 'r') as file:
        template = Template(file.read())
    # render template
    html = template.render(sentences=sentences)
    with open(HTML_PATH, 'w') as file:
        file.write(html)
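
# For reference, a minimal template.html this script could render against
# (an assumption -- the actual template ships separately in the repo):
#
#   <!DOCTYPE html>
#   <html><body>
#     {% for s in sentences %}{{ s.html }} {% endfor %}
#   </body></html>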