from jinja2 import Template
import os
import wikipedia
from markdown import markdown

# importing module
import sys
from types import SimpleNamespace  # used to wrap (text, score) pairs from summa

# appending a path
# sys.path.append('textrank')

# importing required module
import summa.summarizer
from summa.summarizer import summarize


# TODO:
# * DONE: wiki header

# these 3 would require starting from the HTML itself and keeping an index...
# * wiki paragraph
# * wiki hyperlinks
# * list


# variables
# ------------------------------------------------------------------------

# wikipedia_page = "forest"
# wikipedia_page = "warehouse"
# wikipedia_page = "river"
wikipedia_page = "elderflower"
# wikipedia_page = "mushroom"

TEMPLATE_PATH = 'template.html'
HTML_PATH = 'www/index.html'


# utilities
# ------------------------------------------------------------------------

def map_value(value, in_min, in_max, out_min, out_max):
    # linearly remap value from [in_min, in_max] to [out_min, out_max]
    # (parameters renamed from min/max to avoid shadowing the builtins)
    return (((value - in_min) / (in_max - in_min)) * (out_max - out_min)) + out_min


def remap_score(s, min_score, max_score):
    # normalise to [0, 1], then invert: the highest-ranked sentence ends at 0
    s.score = 1 - map_value(s.score, min_score, max_score, 0, 1)
    return s


def compress_score(s):

    # compress whites
    s.score = s.score**3

    # stretch + limiter
    # s.score = min(map_value(s.score, 0, 1, 0, 1.5), 1)
    s.score = 1 if s.score > 0.8 else s.score

    return s
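
# A quick worked example of the scoring utilities (plain arithmetic, not
# real summa output):
#   map_value(5, 0, 10, 0, 1)  -> 0.5
# remap_score() inverts the range: a sentence at max_score ends up at 0,
# one at min_score at 1. compress_score() then cubes the value and snaps
# anything above 0.8 to 1:
#   0.5**3  -> 0.125            (mid greys fade quickly)
#   0.95**3 -> 0.857 -> 1.0     (limiter kicks in)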


# wikipedia
# ------------------------------------------------------------------------

def wikipage(pagename):
    # get wikipedia page content by name of the page

    print(pagename)
    wikipedia.set_lang("en")
    try:
        results = wikipedia.search(pagename, results=1, suggestion=False)
        try:
            pagename = results[0]
        except IndexError:
            # if there is no suggestion or search result, the page doesn't exist
            raise wikipedia.PageError(pagename)
        return wikipedia.WikipediaPage(pagename, redirect=True, preload=True)
    except wikipedia.exceptions.DisambiguationError as e:
        print(e.options)
        page = ''

    return page
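
# Behaviour sketch (hypothetical page names, for illustration only):
#   wikipage("elderflower") -> WikipediaPage for the top search hit
#   wikipage("qqqzzz")      -> raises wikipedia.PageError (no search result)
#   an ambiguous name      -> prints e.options and returns '' (DisambiguationError)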


# parsing and gluing html
# ------------------------------------------------------------------------

def is_header(s):

    # i is the header level: count matching '=' on both ends of the sentence
    # (the i < len // 2 guard prevents running off the end on a run of '=')
    i = 0
    while i < len(s.text) // 2 and s.text[i] == '=' and s.text[len(s.text) - 1 - i] == '=':
        i += 1

    if i > 0:
        # slice off exactly i '=' characters on each side
        # (the previous (-1-i) bound dropped one extra character)
        header_text = s.text[i:-i].strip()
        header_level = i
        return [header_text, header_level]
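
# Example: a sentence whose text is "== Culinary use ==" yields
# ["Culinary use", 2]; plain prose yields None (falsy), so callers can
# simply test `if header:`.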


def wiki_parse(sentences):

    # TODO: doesn't work with section nesting!!
    # 1. replace wikitext headers with html headers
    # 2. add the opacity to each element
    # 3. compute an artificial score for each header: the average score of its section

    new_sentences = []

    print('--- HEADERS ---')
    for i in range(len(sentences)):

        s = sentences[i]

        # if the sentence is a header
        header = is_header(s)
        if header:
            print(header[0])

            # start computing the average score of this section
            current_total = 0
            current_count = 0
            next_header_found = False
            j = i + 1

            # iterate until we find the next header of the same or higher level
            while j < len(sentences) and not next_header_found:

                s2 = sentences[j]
                s2_header = is_header(s2)

                if s2_header:
                    print(' ' + s2_header[0])
                    if header[1] >= s2_header[1]:
                        # encountered a header of the same or higher level
                        next_header_found = True
                        print('X ' + s2_header[0])

                else:
                    # add every plain sentence to the average
                    current_total += s2.score
                    current_count += 1

                j += 1

            if current_count != 0:
                s.score = current_total / current_count
            else:
                s.score = "NaN"

            s.html = '<h'+str(header[1])+' style="opacity:'+str(s.score)+';">'+header[0]+'</h'+str(header[1])+'>'

            # stop at the references part
            if header[0] == "References" or header[0] == "See also":
                break

            new_sentences.append(s)

        # not a header
        else:
            s.html = '<span style="opacity:'+str(s.score)+';">'+s.text+'</span>'
            new_sentences.append(s)

    return new_sentences
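
# Example of what one parsed sentence carries afterwards (opacity values
# are illustrative, not real output):
#   s.html == '<h2 style="opacity:0.42;">Culinary use</h2>'
#   s.html == '<span style="opacity:0.125;">Elderflowers are used ...</span>'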
|
||
|
|
||
|
|
||
|
# textrank
|
||
|
# ------------------------------------------------------------------------
|
||
|
|
||
|
def txt2rankedsentences(txt):
|
||
|
# from txt to ranked sentences
|
||
|
return summarize(txt, split=True)
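
# Usage sketch (scores are illustrative, not real TextRank output):
#   for s in txt2rankedsentences("First sentence. Second sentence."):
#       print(s.score, s.text)   # e.g. 0.51 'First sentence.'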


# main
# ------------------------------------------------------------------------

if __name__ == '__main__':


    # --- WIKI REQUEST ---

    # get text from wikipedia
    print('--- WIKI ---')
    page = wikipage(wikipedia_page)
    if not page:
        sys.exit("--- STOP ---")
    title = '<h1>'+page.title+'</h1>'
    text = page.content

    # print text in terminal
    print('--- TXT ---')
    print(text)


    # --- APPLY TEXTRANK ---

    # apply textrank
    sentences = txt2rankedsentences(text)

    # print ranked sentences in terminal
    print('--- SENTENCES ---')
    for s in sentences:
        print('[{score}] : {sentence}'.format(score=s.score, sentence=s.text))


    # --- REMAP AND COMPRESS ---

    # sorted version of the list
    sorted_sentences = sorted(sentences, key=lambda s: s.score, reverse=True)
    # remap sentence scores to the 0-1 range
    max_score = sorted_sentences[0].score
    min_score = sorted_sentences[-1].score
    sentences = [remap_score(s, min_score, max_score) for s in sentences]
    # compress scores (make more stuff invisible)
    sentences = [compress_score(s) for s in sentences]
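
    # Worked example: raw scores [0.02, 0.10, 0.18] remap (inverted) to
    # [1.0, 0.5, 0.0], then compress to [1.0, 0.125, 0.0] -- so the most
    # central sentence ends up invisible and the least central fully opaque.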


    # -- PARSE ---

    # parse every sentence into either a span or a header
    sentences = wiki_parse(sentences)
    # add back page title
    sentences = [{'html': title, 'text': page.title, 'score': 1}] + sentences


    # -- TEMPLATING ---

    # getting the template
    with open(TEMPLATE_PATH, 'r') as file:
        template = Template(file.read())
    # render template
    html = template.render(sentences=sentences)
    with open(HTML_PATH, 'w') as file:
        file.write(html)
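
# For reference, a minimal template.html this script could render against
# (an assumption -- the actual template ships separately in the repo):
#
#   <!DOCTYPE html>
#   <html><body>
#     {% for s in sentences %}{{ s.html }} {% endfor %}
#   </body></html>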