from jinja2 import Template
from markdown import markdown
import sys

# appending a path
sys.path.append('../')

# importing customised modules
import summa.edits
from summa.edits import scored_sentences

import wikipage
from wikipage.page import get_wikipage


# TODO:
# * DONE: wiki header
# the three below would require starting from the HTML itself and keeping an index...
# * wiki paragraph
# * wiki hyperlinks
# * list
# variables
# ------------------------------------------------------------------------

# wikipedia_page = "forest"
# wikipedia_page = "warehouse"
# wikipedia_page = "river"
wikipedia_page = "elderflower"
# wikipedia_page = "mushroom"

TEMPLATE_PATH = 'template.html'
HTML_PATH = 'www/index.html'
# utilities
# ------------------------------------------------------------------------

def map_value(value, old_min, old_max, new_min, new_max):
    return (((value - old_min) / (old_max - old_min)) * (new_max - new_min)) + new_min
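# for instance (not in the original script, just to illustrate the linear remap):
# map_value(5, 0, 10, 0, 1)      -> 0.5
# map_value(0.2, 0.1, 0.9, 0, 1) -> 0.125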
def remap_score(s, min_score, max_score):
    # normalise the score to 0-1 and invert it: the top-ranked sentence ends up at 0
    s.score = 1 - map_value(s.score, min_score, max_score, 0, 1)
    return s
def compress_score(s):
    # compress whites
    s.score = s.score**3
    # stretch + limiter
    # s.score = min(map_value(s.score, 0, 1, 0, 1.5), 1)
    s.score = 1 if s.score > 0.8 else s.score
    return s
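# for instance (not in the original script, just to illustrate the compression):
# a remapped score of 0.5 becomes 0.5**3 = 0.125 (more transparent), while a
# cubed score above 0.8 (a remapped score above roughly 0.93) is pushed up to
# a fully opaque 1.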
# parsing and gluing html
# ------------------------------------------------------------------------

def is_header(s):
    # i is the header level
    i = 0
    # guard keeps both indices valid for very short or all-'=' strings
    while i < len(s.text) - 1 - i and s.text[i] == '=' and s.text[len(s.text) - 1 - i] == '=':
        i += 1
    if i > 0:
        # strip the i leading and i trailing '=' signs
        header_text = s.text[i:-i].strip()
        header_level = i
        return [header_text, header_level]
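# for instance (not in the original script, just to illustrate):
# a sentence whose text is "== Etymology ==" yields ['Etymology', 2],
# while a regular sentence returns None (falsy), so `if header:` skips it.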
def wiki_parse(sentences):
    # TODO: doesn't work with section nesting!!
    # 1. replace the wikitext header with an html header
    # 2. add the opacity to each element
    # 3. compute an artificial score for each header: the average score of its section
    new_sentences = []
    print('--- HEADERS ---')
    for i in range(len(sentences)):
        s = sentences[i]
        # if the sentence is a header
        header = is_header(s)
        if header:
            print(header[0])
            # start computing the average score of this section
            current_total = 0
            current_count = 0
            next_header_found = False
            j = i + 1
            # iterate until we find the next header at the same or a shallower level
            while j < len(sentences) and not next_header_found:
                s2 = sentences[j]
                s2_header = is_header(s2)
                if s2_header:
                    print(' ' + s2_header[0])
                    if header[1] >= s2_header[1]:
                        # found a header at the same or a shallower level, stop here
                        next_header_found = True
                        print('X ' + s2_header[0])
                else:
                    # add every plain sentence of the section to the average
                    current_total += s2.score
                    current_count += 1
                j += 1
            if current_count != 0:
                s.score = current_total / current_count
            else:
                s.score = "NaN"
            s.html = '<h' + str(header[1]) + ' style="opacity:' + str(s.score) + ';">' + header[0] + '</h' + str(header[1]) + '>'
            # stop at the references part
            if header[0] == "References" or header[0] == "See also":
                break
            new_sentences.append(s)
        # not a header
        else:
            s.html = '<span style="opacity:' + str(s.score) + ';">' + s.text + '</span>'
            new_sentences.append(s)
    return new_sentences
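# for instance (not in the original script, just to illustrate the output):
# a plain sentence scored 0.3 becomes
#   <span style="opacity:0.3;">Some sentence from the article.</span>
# and a "== Uses ==" header whose section averages 0.5 becomes
#   <h2 style="opacity:0.5;">Uses</h2>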
# textrank
# ------------------------------------------------------------------------

def txt2rankedsentences(txt):
    # from txt to ranked sentences
    return scored_sentences(txt, split=True)
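# note: scored_sentences comes from the customised summa.edits module imported
# above; as used in this script, each returned item is expected to expose at
# least a .text (the sentence) and a mutable .score (its textrank weight).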
# main
# ------------------------------------------------------------------------

if __name__ == '__main__':

    # --- WIKI REQUEST ---
    # get text from wikipedia
    print('--- WIKI ---')
    page = get_wikipage(wikipedia_page)
    if not page:
        sys.exit("--- STOP ---")
    title = '<h1>' + page.title + '</h1>'
    text = page.content

    # print text in terminal
    print('--- TXT ---')
    print(text)

    # --- APPLY TEXTRANK ---
    # apply textrank
    sentences = txt2rankedsentences(text)

    # print ranked sentences in terminal
    print('--- SENTENCES ---')
    for s in sentences:
        print('[{score}] : {sentence}'.format(score=s.score, sentence=s.text))

    # --- REMAP AND COMPRESS ---
    # sorted version of the list, highest score first
    sorted_sentences = sorted(sentences, key=lambda s: s.score, reverse=True)
    # remap scores to the 0-1 range (remap_score also inverts them:
    # the top-ranked sentences end up closest to 0, i.e. the most transparent)
    max_score = sorted_sentences[0].score
    min_score = sorted_sentences[-1].score
    sentences = [remap_score(s, min_score, max_score) for s in sentences]
    # compress scores (make more stuff invisible)
    sentences = [compress_score(s) for s in sentences]
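    # worked example (not in the original script): with raw textrank scores
    # ranging from min 0.1 to max 0.9, a sentence scored 0.7 is remapped to
    # 1 - (0.7 - 0.1) / (0.9 - 0.1) = 0.25, then compressed to 0.25**3 ~= 0.016,
    # so that "important" sentence is rendered almost invisible.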
    # --- PARSE ---
    # parse every sentence into either a span or a header
    sentences = wiki_parse(sentences)
    # add back the page title
    sentences = [{'html': title, 'text': page.title, 'score': 1}] + sentences
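    # the template file itself is not part of this script; a minimal sketch of
    # what template.html is assumed to contain is a jinja2 loop that prints each
    # item's pre-built html, e.g.:
    #   <body>
    #   {% for s in sentences %}{{ s.html }} {% endfor %}
    #   </body>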
    # --- TEMPLATING ---
    # getting the template
    with open(TEMPLATE_PATH, 'r') as file:
        template = Template(file.read())
    # render template
    html = template.render(sentences=sentences)
    with open(HTML_PATH, 'w') as file:
        file.write(html)