restructured folder by experiment
parent
efd8e77df9
commit
4a0bfb820d
@ -0,0 +1,233 @@
|
|||||||
|
from jinja2 import Template
|
||||||
|
import os
|
||||||
|
import wikipedia
|
||||||
|
from markdown import markdown
|
||||||
|
|
||||||
|
# importing module
|
||||||
|
import sys
|
||||||
|
|
||||||
|
# appending a path
|
||||||
|
sys.path.append('../')
|
||||||
|
|
||||||
|
# importing required module
|
||||||
|
import summa.summarizer
|
||||||
|
from summa.summarizer import summarize
|
||||||
|
|
||||||
|
|
||||||
|
# TODO:
|
||||||
|
# * DONE: wiki header
|
||||||
|
|
||||||
|
# those 3 would ask to start from the HTML itself and keep an index...
|
||||||
|
# * wiki paragraph
|
||||||
|
# * wiki hyperlinks
|
||||||
|
# * list
|
||||||
|
|
||||||
|
|
||||||
|
# variables
|
||||||
|
# ------------------------------------------------------------------------
|
||||||
|
|
||||||
|
# wikipedia_page = "forest"
|
||||||
|
# wikipedia_page = "warehouse"
|
||||||
|
# wikipedia_page = "river"
|
||||||
|
wikipedia_page = "elderflower"
|
||||||
|
# wikipedia_page = "mushroom"
|
||||||
|
|
||||||
|
TEMPLATE_PATH = 'template.html'
|
||||||
|
HTML_PATH = 'www/index.html'
|
||||||
|
|
||||||
|
|
||||||
|
# utilities
|
||||||
|
# ------------------------------------------------------------------------
|
||||||
|
|
||||||
|
def map_value(value, min, max, new_min, new_max):
|
||||||
|
return (((value - min) / (max - min)) * (new_max - new_min)) + new_min
|
||||||
|
|
||||||
|
def remap_score(s, min_score, max_score):
|
||||||
|
s.score = 1 - map_value(s.score, min_score, max_score, 0, 1)
|
||||||
|
return s
|
||||||
|
|
||||||
|
def compress_score(s):
|
||||||
|
|
||||||
|
# compress whites
|
||||||
|
s.score = s.score**3
|
||||||
|
|
||||||
|
# stretch + limiter
|
||||||
|
# s.score = min(map_value(s.score, 0, 1, 0, 1.5), 1)
|
||||||
|
s.score = 1 if s.score > 0.8 else s.score
|
||||||
|
|
||||||
|
return s
|
||||||
|
|
||||||
|
|
||||||
|
# wikipedia
|
||||||
|
# ------------------------------------------------------------------------
|
||||||
|
|
||||||
|
def wikipage(pagename):
|
||||||
|
# get wikipedia page content by name of the page
|
||||||
|
|
||||||
|
print(pagename)
|
||||||
|
wikipedia.set_lang("en")
|
||||||
|
try:
|
||||||
|
results = wikipedia.search(pagename, results=1, suggestion=False)
|
||||||
|
try:
|
||||||
|
pagename = results[0]
|
||||||
|
except IndexError:
|
||||||
|
# if there is no suggestion or search results, the page doesn't exist
|
||||||
|
raise wikipedia.PageError(pagename)
|
||||||
|
return wikipedia.WikipediaPage(pagename, redirect=True, preload=True)
|
||||||
|
except wikipedia.exceptions.DisambiguationError as e:
|
||||||
|
print(e.options)
|
||||||
|
page = ''
|
||||||
|
|
||||||
|
return page
|
||||||
|
|
||||||
|
|
||||||
|
# parsing and gluing html
|
||||||
|
# ------------------------------------------------------------------------
|
||||||
|
|
||||||
|
def is_header(s):
|
||||||
|
|
||||||
|
# i is the header level
|
||||||
|
i = 0
|
||||||
|
while s.text[i] == '=' and s.text[len(s.text) - 1 - i] == '=':
|
||||||
|
i += 1
|
||||||
|
|
||||||
|
if i > 0:
|
||||||
|
header_text = s.text[i:(-1-i)].strip()
|
||||||
|
header_level = i
|
||||||
|
return [header_text, header_level]
|
||||||
|
|
||||||
|
def wiki_parse(sentences):
|
||||||
|
|
||||||
|
# TODO: doesn't work with section nesting!!
|
||||||
|
# 1. replace wikitext header with html header
|
||||||
|
# 2. add the opacity to each elements
|
||||||
|
# 3. compute an artificial score for header that is an average of the score of the section
|
||||||
|
|
||||||
|
new_sentences = []
|
||||||
|
|
||||||
|
print('--- HEADERS ---')
|
||||||
|
for i in range(len(sentences)):
|
||||||
|
|
||||||
|
s = sentences[i]
|
||||||
|
|
||||||
|
# if sentences is header
|
||||||
|
header = is_header(s)
|
||||||
|
if header:
|
||||||
|
print(header[0])
|
||||||
|
|
||||||
|
# start computing the average of score of this section
|
||||||
|
current_total = 0
|
||||||
|
current_count = 0
|
||||||
|
next_header_found = False
|
||||||
|
j = i + 1
|
||||||
|
|
||||||
|
# iterating while we find next header with greatest or same level
|
||||||
|
while j < len(sentences) and not next_header_found:
|
||||||
|
|
||||||
|
s2 = sentences[j]
|
||||||
|
s2_header = is_header(s2)
|
||||||
|
|
||||||
|
if s2_header:
|
||||||
|
print(' ' + s2_header[0])
|
||||||
|
if header[1] >= s2_header[1]:
|
||||||
|
# encounter header of higher level
|
||||||
|
next_header_found = True
|
||||||
|
print('X ' + s2_header[0])
|
||||||
|
|
||||||
|
else:
|
||||||
|
# adding every sentence to the average
|
||||||
|
current_total += s2.score
|
||||||
|
current_count += 1
|
||||||
|
|
||||||
|
j += 1
|
||||||
|
|
||||||
|
if current_count != 0:
|
||||||
|
s.score = current_total / current_count
|
||||||
|
else:
|
||||||
|
s.score = "NaN"
|
||||||
|
|
||||||
|
s.html = '<h'+str(header[1])+' style="opacity:'+str(s.score)+';">'+header[0]+'</h'+str(header[1])+'>'
|
||||||
|
|
||||||
|
# stops at the references part
|
||||||
|
if header[0] == "References" or header[0] == "See also":
|
||||||
|
break
|
||||||
|
|
||||||
|
new_sentences.append(s)
|
||||||
|
|
||||||
|
# not a header
|
||||||
|
else:
|
||||||
|
s.html = '<span style="opacity:'+str(s.score)+';">'+s.text+'</span>'
|
||||||
|
new_sentences.append(s)
|
||||||
|
|
||||||
|
return new_sentences
|
||||||
|
|
||||||
|
|
||||||
|
# textrank
|
||||||
|
# ------------------------------------------------------------------------
|
||||||
|
|
||||||
|
def txt2rankedsentences(txt):
|
||||||
|
# from txt to ranked sentences
|
||||||
|
return summarize(txt, split=True)
|
||||||
|
|
||||||
|
|
||||||
|
# main
|
||||||
|
# ------------------------------------------------------------------------
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
|
||||||
|
|
||||||
|
# --- WIKI REQUEST ---
|
||||||
|
|
||||||
|
# get text from wikipedia
|
||||||
|
print('--- WIKI ---')
|
||||||
|
page = wikipage(wikipedia_page)
|
||||||
|
if not page:
|
||||||
|
sys.exit("--- STOP ---")
|
||||||
|
title = '<h1>'+page.title+'</h1>'
|
||||||
|
text = page.content
|
||||||
|
|
||||||
|
# print text in terminal
|
||||||
|
print('--- TXT ---')
|
||||||
|
print(text)
|
||||||
|
|
||||||
|
|
||||||
|
# --- APPLY TEXTRANK ---
|
||||||
|
|
||||||
|
# apply textrank
|
||||||
|
sentences = txt2rankedsentences(text)
|
||||||
|
|
||||||
|
# print ranked sentences in terminal
|
||||||
|
print('--- SENTENCES ---')
|
||||||
|
for s in sentences:
|
||||||
|
print('[{score}] : {sentence}'.format(score = s.score, sentence = s.text))
|
||||||
|
|
||||||
|
|
||||||
|
# --- REMAP AND COMPRESS ---
|
||||||
|
|
||||||
|
# sorted version of the list
|
||||||
|
sorted_sentences = sorted(sentences, key=lambda s: s.score, reverse=True)
|
||||||
|
# remap sentences from 0 to 1
|
||||||
|
max_score = sorted_sentences[0].score
|
||||||
|
min_score = sorted_sentences[-1].score
|
||||||
|
sentences = [remap_score(s, min_score, max_score) for s in sentences]
|
||||||
|
# compress scores (make more stuff invisible)
|
||||||
|
sentences = [compress_score(s) for s in sentences]
|
||||||
|
|
||||||
|
|
||||||
|
# -- PARSE ---
|
||||||
|
|
||||||
|
# parse every sentences to either span or header
|
||||||
|
sentences = wiki_parse(sentences)
|
||||||
|
# add back page title
|
||||||
|
sentences = [{ 'html': title, 'text': page.title, 'score': 1 }] + sentences
|
||||||
|
|
||||||
|
|
||||||
|
# -- TEMPLATING ---
|
||||||
|
|
||||||
|
# getting the template
|
||||||
|
with open(TEMPLATE_PATH, 'r') as file:
|
||||||
|
template = Template(file.read())
|
||||||
|
# render template
|
||||||
|
html = template.render(sentences = sentences)
|
||||||
|
with open(HTML_PATH, 'w') as file:
|
||||||
|
file.write(html)
|
@ -0,0 +1,31 @@
|
|||||||
|
<!DOCTYPE html>
|
||||||
|
|
||||||
|
<html lang="fr">
|
||||||
|
|
||||||
|
<head>
|
||||||
|
|
||||||
|
<meta charset="UTF-8" />
|
||||||
|
|
||||||
|
<title>TextRank Opacity</title>
|
||||||
|
<meta name="description" content="a call for 2 desks in studio 5 of the Meyboom artist-run spaces">
|
||||||
|
|
||||||
|
<link rel="stylesheet" type="text/css" href="css/main.css" />
|
||||||
|
<link rel="stylesheet" type="text/css" href="css/typography.css" />
|
||||||
|
|
||||||
|
<meta name="viewport" content="width=device-width, initial-scale=1">
|
||||||
|
|
||||||
|
</head>
|
||||||
|
|
||||||
|
<body>
|
||||||
|
|
||||||
|
<main>
|
||||||
|
|
||||||
|
{% for s in sentences %}
|
||||||
|
{{ s.html|safe }}
|
||||||
|
{% endfor %}
|
||||||
|
|
||||||
|
</main>
|
||||||
|
|
||||||
|
</body>
|
||||||
|
|
||||||
|
</html>
|
@ -0,0 +1 @@
|
|||||||
|
A warehouse is a building for storing goods. Warehouses are used by manufacturers, importers, exporters, wholesalers, transport businesses, customs, etc. They are usually large plain buildings in industrial parks on the outskirts of cities, towns, or villages. They usually have loading docks to load and unload goods from trucks. Sometimes warehouses are designed for the loading and unloading of goods directly from railways, airports, or seaports. They often have cranes and forklifts for moving goods, which are usually placed on ISO standard pallets and then loaded into pallet racks. Stored goods can include any raw materials, packing materials, spare parts, components, or finished goods associated with agriculture, manufacturing, and production. In India and Hong Kong, a warehouse may be referred to as a "godown". There are also godowns in the Shanghai Bund.
|
@ -0,0 +1,38 @@
|
|||||||
|
|
||||||
|
:root{
|
||||||
|
--lh: 1.35rem;
|
||||||
|
}
|
||||||
|
|
||||||
|
body{
|
||||||
|
margin: var(--lh);
|
||||||
|
line-height: var(--lh);
|
||||||
|
}
|
||||||
|
|
||||||
|
@media print{
|
||||||
|
body{
|
||||||
|
margin: 0;
|
||||||
|
font-size: 10pt;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
main{
|
||||||
|
max-width: 42rem;
|
||||||
|
margin: 0 auto;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* h1,h2,h3,h4,h5,h6{
|
||||||
|
line-height: var(--lh);
|
||||||
|
} */
|
||||||
|
|
||||||
|
h1{
|
||||||
|
text-align: center;
|
||||||
|
margin: calc(2 * var(--lh)) 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
h2,h3,h4,h5,h6{
|
||||||
|
margin: calc(3 * var(--lh)) 0 var(--lh);
|
||||||
|
}
|
||||||
|
|
||||||
|
:is(h1,h2,h3,h4,h5,h6) + :is(h1,h2,h3,h4,h5,h6){
|
||||||
|
margin-top: var(--lh);
|
||||||
|
}
|
Loading…
Reference in New Issue