"""Fetch a Wikipedia page, score its sentences with TextRank (summa),
convert wikitext headers to HTML, and render the result into a Jinja2
template where each sentence's score drives its visual opacity.
"""

from jinja2 import Template
from markdown import markdown
import sys

# appending a path so the local customised modules below are importable
sys.path.append('../')

# importing customised modules
import summa.edits
from summa.edits import scored_sentences
import wikipage
from wikipage.page import get_wikipage

# TODO:
# * DONE: wiki header
# those 3 would ask to start from the HTML itself and keep an index...
# * wiki paragraph
# * wiki hyperlinks
# * list


# variables
# ------------------------------------------------------------------------

# wikipedia_page = "forest"
# wikipedia_page = "warehouse"
wikipedia_page = "river"
# wikipedia_page = "elderflower"
# wikipedia_page = "mushroom"

TEMPLATE_PATH = 'template.html'
HTML_PATH = 'www/index.html'


# utilities
# ------------------------------------------------------------------------

def map_value(value, in_min, in_max, out_min, out_max):
    """Linearly remap *value* from [in_min, in_max] to [out_min, out_max].

    Raises ZeroDivisionError when in_min == in_max; callers must guard.
    (Parameters renamed from min/max to stop shadowing the builtins.)
    """
    return (((value - in_min) / (in_max - in_min)) * (out_max - out_min)) + out_min


def remap_score(s, min_score, max_score):
    """Normalise s.score into [0, 1] and invert it (best sentence -> 0).

    Mutates and returns the sentence object *s*.
    """
    s.score = 1 - map_value(s.score, min_score, max_score, 0, 1)
    return s


def compress_score(s):
    """Push low scores toward 0 and clip high ones to 1 (more contrast).

    Mutates and returns the sentence object *s*.
    """
    # compress whites: cubing shrinks everything below 1
    s.score = s.score ** 3
    # stretch + limiter
    # s.score = min(map_value(s.score, 0, 1, 0, 1.5), 1)
    s.score = 1 if s.score > 0.8 else s.score
    return s


# parsing and gluing html
# ------------------------------------------------------------------------

def is_header(s):
    """Return [header_text, header_level] if *s* is a wikitext header
    (e.g. "== History =="), otherwise None.

    The level is the number of matching '=' on each side.
    """
    text = s.text
    n = len(text)
    level = 0
    # Count matching leading/trailing '=' pairs. The `level < n - 1 - level`
    # bound stops the scan before the two indices cross, which prevents the
    # IndexError the previous version hit on all-'=' strings such as "==".
    while level < n - 1 - level and text[level] == '=' and text[n - 1 - level] == '=':
        level += 1
    if level > 0:
        # Slice off exactly `level` chars per side. (The old `[i:(-1-i)]`
        # dropped one character too many and only worked because wikitext
        # headers usually carry a space before the closing '='.)
        return [text[level:n - level].strip(), level]
    return None


def wiki_parse(sentences):
    """Convert scored wikitext sentences into renderable items.

    For every header sentence, its score is replaced by the average score
    of the sentences in its section (so a header fades like its content);
    parsing stops at the "References" / "See also" sections.
    Mutates the sentence objects (adds .html, rewrites .score) and returns
    the new list.

    TODO: doesn't work with section nesting!!
    NOTE(review): the HTML markup around headers/sentences appears to have
    been lost in extraction ('' + text + ''); confirm against the original
    template before changing it.
    """
    new_sentences = []
    print('--- HEADERS ---')
    for i, s in enumerate(sentences):
        header = is_header(s)
        if not header:
            # plain sentence: passed through with its (remapped) score
            s.html = '' + s.text + ''
            new_sentences.append(s)
            continue

        print(header[0])
        # average the scores of this section's sentences
        current_total = 0
        current_count = 0
        # iterate until the next header of the same or higher level
        # (i.e. a header with the same or *smaller* '=' count)
        for s2 in sentences[i + 1:]:
            s2_header = is_header(s2)
            if s2_header:
                print(' ' + s2_header[0])
                if header[1] >= s2_header[1]:
                    print('X ' + s2_header[0])
                    break
            else:
                current_total += s2.score
                current_count += 1

        if current_count != 0:
            s.score = current_total / current_count
        else:
            # empty section: keep the original "NaN" string sentinel
            s.score = "NaN"

        s.html = '' + header[0] + ''
        # stops at the references part (header itself is not emitted)
        if header[0] == "References" or header[0] == "See also":
            break
        new_sentences.append(s)

    return new_sentences


# main
# ------------------------------------------------------------------------

if __name__ == '__main__':

    # --- WIKI REQUEST ---
    # get text from wikipedia
    print('--- WIKI ---')
    page = get_wikipage(wikipedia_page)
    if not page:
        sys.exit("--- STOP ---")

    # NOTE(review): markup around the title appears lost in extraction —
    # confirm against the original template.
    title = '' + page.title + ''
    text = page.content

    # print text in terminal
    print('--- TXT ---')
    print(text)

    # --- APPLY TEXTRANK ---
    sentences = scored_sentences(text)
    if not sentences:
        sys.exit("--- STOP: no sentences extracted ---")

    # print ranked sentences in terminal
    print('--- SENTENCES ---')
    for s in sentences:
        print('[{score}] : {sentence}'.format(score=s.score, sentence=s.text))

    # --- REMAP AND COMPRESS ---
    # sorted version of the list (highest score first)
    sorted_sentences = sorted(sentences, key=lambda s: s.score, reverse=True)

    # remap sentences from 0 to 1
    max_score = sorted_sentences[0].score
    min_score = sorted_sentences[-1].score
    if max_score != min_score:
        sentences = [remap_score(s, min_score, max_score) for s in sentences]
    else:
        # degenerate case (all scores equal): avoid division by zero,
        # treat every sentence as top-ranked
        for s in sentences:
            s.score = 0

    # compress scores (make more stuff invisible)
    sentences = [compress_score(s) for s in sentences]

    # --- PARSE ---
    # parse every sentence to either span or header
    sentences = wiki_parse(sentences)

    # add back page title as a fully-opaque first item
    sentences = [{'html': title, 'text': page.title, 'score': 1}] + sentences

    # --- TEMPLATING ---
    with open(TEMPLATE_PATH, 'r') as file:
        template = Template(file.read())

    # render template and write it out
    html = template.render(sentences=sentences)
    with open(HTML_PATH, 'w') as file:
        file.write(html)