You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
104 lines
2.4 KiB
Python
104 lines
2.4 KiB
Python
from jinja2 import Template
|
|
from markdown import markdown
|
|
import sys
|
|
|
|
# appending a path
|
|
sys.path.append('../')
|
|
|
|
# importing customised module
|
|
import summa.edits
|
|
from summa.edits import scored_sentences
|
|
|
|
import wikipage
|
|
from wikipage.page import get_wikipage, is_header
|
|
|
|
# variables
|
|
# ------------------------------------------------------------------------
|
|
|
|
|
|
# Alternative two-language configuration (commented out):
# languages = ['en','fr']
# page_requests = ['river', 'rivière']

# The three lists below are index-aligned: entry i of each list describes the
# same page, so they must always be edited together and keep the same length.
languages = ['en', 'fr', 'es']                     # codes handed to get_wikipage
languages_full = ['english', 'french', 'spanish']  # names handed to scored_sentences
page_requests = ['woman', 'femme', 'mujer']        # page requested per language

# possible languages for summa
# "danish", "dutch", "english", "finnish", "french", "german",
# "hungarian", "italian", "norwegian", "porter", "portuguese",
# "romanian", "russian", "spanish", "swedish"
# according to doc: https://summanlp.github.io/textrank/

TEMPLATE_PATH = 'template.html'  # template read by the templating step
HTML_PATH = 'www/index.html'     # file the rendered HTML is written to
|
|
|
|
|
|
# utilities
|
|
# ------------------------------------------------------------------------
|
|
|
|
|
|
def page_request(request, lang):
    """Return the wiki page for *request* in language *lang*, or exit.

    Prints a ``--- WIKI ---`` banner, then delegates the lookup to
    ``get_wikipage(request, lang)``. When that call returns a falsy value
    the whole program is terminated through ``sys.exit`` with the message
    ``--- STOP ---``; otherwise the returned page object is handed back
    unchanged.
    """
    # get text from wikipedia
    print('--- WIKI ---')
    wiki_page = get_wikipage(request, lang)

    # Successful lookup: give the page straight back to the caller.
    if wiki_page:
        return wiki_page

    # Nothing usable came back, so there is no point in continuing the run.
    sys.exit("--- STOP ---")
|
|
|
|
|
|
# main
|
|
# ------------------------------------------------------------------------
|
|
|
|
if __name__ == '__main__':

    # Pages that went through the whole pipeline, in configuration order.
    processed_pages = []

    # languages, languages_full and page_requests are index-aligned, so walk
    # them in lockstep instead of indexing each one by position.
    for lang, lang_full, request in zip(languages, languages_full, page_requests):

        # --- WIKI REQUEST ---
        # page_request() terminates the program itself when no page is found.
        page = page_request(request, lang)
        print("got " + page.title)

        # add the lang
        page.lang = lang

        # --- APPLY TEXTRANK ---
        # Score every sentence of the page, then keep them ordered from the
        # highest score to the lowest.
        page.sentences = sorted(
            scored_sentences(page.content, language=lang_full),
            key=lambda s: s.score,
            reverse=True,
        )

        # TODO: remove the header from page.content (was only stubbed out).

        processed_pages.append(page)

    # --- TEMPLATING ---

    # getting the template
    # Explicit UTF-8: the pages contain non-ASCII text (French, Spanish), and
    # the platform default encoding is not guaranteed to handle it.
    with open(TEMPLATE_PATH, 'r', encoding='utf-8') as file:
        template = Template(file.read())

    # render template
    html = template.render(pages=processed_pages)

    with open(HTML_PATH, 'w', encoding='utf-8') as file:
        file.write(html)
|