from jinja2 import Template
import os
import wikipedia
from markdown import markdown
# make the parent directory importable so the local summa package is found
import sys
sys.path.append('../')
import summa.summarizer
from summa.summarizer import summarize
# TODO:
# * DONE: wiki header
# the three below would require starting from the HTML itself and keeping an index...
# * wiki paragraph
# * wiki hyperlinks
# * list
# variables
# ------------------------------------------------------------------------
# wikipedia_page = "forest"
# wikipedia_page = "warehouse"
# wikipedia_page = "river"
wikipedia_page = "elderflower"
# wikipedia_page = "mushroom"
TEMPLATE_PATH = 'template.html'
HTML_PATH = 'www/index.html'
# utilities
# ------------------------------------------------------------------------
def map_value(value, in_min, in_max, out_min, out_max):
    # linearly remap value from [in_min, in_max] to [out_min, out_max]
    return (((value - in_min) / (in_max - in_min)) * (out_max - out_min)) + out_min
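# e.g. map_value(0.25, 0, 1, 0, 100) == 25.0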
def remap_score(s, min_score, max_score):
    # normalize the sentence score to [0, 1], then invert the scale
    s.score = 1 - map_value(s.score, min_score, max_score, 0, 1)
    return s
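# e.g. a raw score equal to max_score remaps to 0, a raw score equal to min_score remaps to 1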
def compress_score(s):
    # push low scores towards zero ("compress the whites")
    s.score = s.score**3
    # stretch + limiter
    # s.score = min(map_value(s.score, 0, 1, 0, 1.5), 1)
    # limiter: anything above 0.8 becomes 1
    s.score = 1 if s.score > 0.8 else s.score
    return s
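# e.g. 0.5 -> 0.125, 0.95 -> 1 (0.857 after cubing, then clipped by the limiter)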
# wikipedia
# ------------------------------------------------------------------------
def wikipage(pagename):
    # get the Wikipedia page content from the name of the page
print(pagename)
wikipedia.set_lang("en")
try:
results = wikipedia.search(pagename, results=1, suggestion=False)
try:
pagename = results[0]
except IndexError:
# if there is no suggestion or search results, the page doesn't exist
raise wikipedia.PageError(pagename)
return wikipedia.WikipediaPage(pagename, redirect=True, preload=True)
    except wikipedia.exceptions.DisambiguationError as e:
        # ambiguous page name: print the options and return an empty page
        print(e.options)
        page = ''
        return page
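# usage sketch (requires network access):
#   page = wikipage(wikipedia_page)
#   if page:
#       print(page.title)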
# parsing and gluing html
# ------------------------------------------------------------------------
def is_header(s):
    # i is the header level: count the '=' signs wrapping the sentence text
    i = 0
    while i < len(s.text) // 2 and s.text[i] == '=' and s.text[len(s.text) - 1 - i] == '=':
        i += 1
    if i > 0:
        header_text = s.text[i:-i].strip()
        header_level = i
        return [header_text, header_level]
    return None
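# e.g. a sentence whose text is "== History ==" yields ["History", 2]; non-headers yield None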
def wiki_parse(sentences):
    # TODO: doesn't work with section nesting!!
    # 1. replace the wikitext headers with html headers
    # 2. add the opacity to each element
    # 3. compute an artificial score for each header: the average score of its section
new_sentences = []
print('--- HEADERS ---')
for i in range(len(sentences)):
s = sentences[i]
        # if the sentence is a header
header = is_header(s)
if header:
print(header[0])
# start computing the average of score of this section
current_total = 0
current_count = 0
next_header_found = False
j = i + 1
            # iterate until we find the next header at the same or a shallower level
while j < len(sentences) and not next_header_found:
s2 = sentences[j]
s2_header = is_header(s2)
if s2_header:
print(' ' + s2_header[0])
                    if header[1] >= s2_header[1]:
                        # encountered a header of the same or shallower level: the section ends here
                        next_header_found = True
                        print('X ' + s2_header[0])
else:
# adding every sentence to the average
current_total += s2.score
current_count += 1
j += 1
            if current_count != 0:
                s.score = current_total / current_count
            else:
                # empty section: no sentences to average
                s.score = "NaN"
s.html = '