You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

104 lines
2.4 KiB
Python

from jinja2 import Template
from markdown import markdown
import sys
# appending a path
sys.path.append('../')
# importing customised module
import summa.edits
from summa.edits import scored_sentences
import wikipage
from wikipage.page import get_wikipage, is_header
# variables
# ------------------------------------------------------------------------
# The three lists below are parallel: the main loop reads position i of each
# one to describe a single page (wiki language code, language name given to
# scored_sentences, and the page title to request).
# Earlier two-language configuration, kept for reference:
# languages = ['en','fr']
# page_requests = ['river', 'rivière']
languages = [
    'en',
    'fr',
    'es',
]
languages_full = [
    'english',
    'french',
    'spanish',
]
page_requests = [
    'woman',
    'femme',
    'mujer',
]
# Language names supported by summa, according to the documentation at
# https://summanlp.github.io/textrank/ :
# "danish", "dutch", "english", "finnish", "french", "german",
# "hungarian", "italian", "norwegian", "porter", "portuguese",
# "romanian", "russian", "spanish", "swedish"
# Input template file and output HTML file used by the main script.
TEMPLATE_PATH = 'template.html'
HTML_PATH = 'www/index.html'
# utilities
# ------------------------------------------------------------------------
def page_request(request, lang):
    """Fetch a wiki page through get_wikipage and return it.

    `request` and `lang` are forwarded unchanged to get_wikipage. If that
    call returns a falsy value, the whole program is terminated via
    sys.exit with the message "--- STOP ---".
    """
    print('--- WIKI ---')
    wiki_page = get_wikipage(request, lang)
    if wiki_page:
        return wiki_page
    # nothing usable came back: abort the run
    sys.exit("--- STOP ---")
# main
# ------------------------------------------------------------------------
if __name__ == '__main__':
    # Fetch one page per configured language, score its sentences, then
    # render all pages into a single HTML file.
    processed_pages = []
    # The three configuration lists are parallel; zip walks them in lockstep
    # (it stops at the shortest list if their lengths ever differ).
    for lang, lang_full, request in zip(languages, languages_full, page_requests):
        # --- WIKI REQUEST ---
        # page_request exits the program when no page is returned
        page = page_request(request, lang)
        print("got " + page.title)
        # attach the language code to the page object
        # (presumably read by the template; confirm against template.html)
        page.lang = lang
        # --- APPLY TEXTRANK ---
        sentences = scored_sentences(page.content, language=lang_full)
        # highest-scoring sentences first
        page.sentences = sorted(sentences, key=lambda s: s.score, reverse=True)
        # TODO(review): the original left a "remove header" step on
        # page.content unimplemented at this point.
        processed_pages.append(page)
    # --- TEMPLATING ---
    # Explicit UTF-8 so the result does not depend on the platform's default
    # encoding (the pages contain non-ASCII French/Spanish text).
    # NOTE(review): assumes template.html itself is stored as UTF-8 - confirm.
    with open(TEMPLATE_PATH, 'r', encoding='utf-8') as file:
        template = Template(file.read())
    # render template
    html = template.render(pages=processed_pages)
    with open(HTML_PATH, 'w', encoding='utf-8') as file:
        file.write(html)