wikipage getter in a separate folder so it's also usable from different experiments

3 years ago · 8ae06f04df
parent fcf80318be
commit 8ae06f04df
11 changed files with 115 additions and 36 deletions
--- a/README.md
+++ b/README.md
@ -1,5 +1,12 @@
-## opacity experiment
+## edited summa (textrank)
 summa is a textrank python implementation (https://github.com/summanlp/textrank).
 it was modified under `summa/` by adding a `summa/edits.py` file that defines two new functions, to access the internal processing steps of textrank:
 1. `scored_sentences`: gives the list of all the sentences with their score.
 2. `similarity_graph`: gives the matrix of similarity of all the sentences in a text.
 ## [EXP] opacity
 For any wikipedia page, show the text content, but with every sentence given an opacity inversely proportional to its TextRank score.
@ -7,17 +14,20 @@ Meaning sentences considered as _"relevant to be included in a summary of the ar
 ### using
-* textrank python implementation (https://github.com/summanlp/textrank) modified under `summa/` so it gives us all the sentences with their score.
+* edited summa (https://github.com/summanlp/textrank)
 * wikipedia python module (https://pypi.org/project/wikipedia/)
 ### to use
 modify the variable `wikipedia_page` in `make.py` to whatever page then
        cd exp.opacity
        python3 make.py
 ### technical notes
 * **headers opacities** were manually recomputed as the average of their section; this is justified because otherwise they break the flow of the document (their shortness seems to put them either nearly full black or nearly white, independently of how textrank ranks the paragraphs in their associated sections)
 * using the `.content` method of python wikipedia, we get **plain text plus headers in wikitext**, but things like `<p>`, `<ul>`, `<blockquote>`, etc. have all disappeared. see if we want to craft a version using the `.html` method of python wikipedia, but it becomes more complex because of sentence tokenisation; we would probably need an index to keep track of each sentence's original nested div location.
-* **opacities were remapped** to add contrast to their curves. still need to experiment with that to find some kind of nice compromise on both paper and screen ?
+* **opacities were remapped** to add contrast to their curves. still need to experiment with that to find some kind of nice compromise on both paper and screen ?
 ## [EXP] recommanded
--- a/exp.opacity/make.py
+++ b/exp.opacity/make.py
@ -1,18 +1,16 @@
 from jinja2 import Template
 import os
 import wikipedia
 from markdown import markdown
 # importing module
 import sys
 # appending a path
 sys.path.append('../')
-# importing required module
+# importing customised module
-import summa.summarizer
+import summa.edits
-from summa.summarizer import scored_sentences
+from summa.edits import scored_sentences
 import wikipage
 from wikipage.page import get_wikipage
 # TODO:
 # * DONE: wiki header
@ -57,30 +55,6 @@ def compress_score(s):
    return s
 #   wikipedia
 #   ------------------------------------------------------------------------
 def wikipage(pagename):
    """Fetch the English Wikipedia page that best matches *pagename*.

    The name is first run through ``wikipedia.search`` (one result, no
    suggestion) and the top hit is loaded as a ``WikipediaPage``.

    Returns the loaded page, or an empty string when the lookup raises a
    ``DisambiguationError`` (the candidate options are printed first).
    Raises ``wikipedia.PageError`` when the search returns no result.
    """
    print(pagename)
    wikipedia.set_lang("en")
    try:
        hits = wikipedia.search(pagename, results=1, suggestion=False)
        try:
            best_match = hits[0]
        except IndexError:
            # an empty result list means there is no such page
            raise wikipedia.PageError(pagename)
        return wikipedia.WikipediaPage(best_match, redirect=True, preload=True)
    except wikipedia.exceptions.DisambiguationError as ambiguity:
        # show the possible meanings so the caller can pick a precise name
        print(ambiguity.options)
    # falsy placeholder: callers test the result with `if not page`
    return ''
 #   parsing and gluing html
 #   ------------------------------------------------------------------------
@ -180,7 +154,7 @@ if __name__ == '__main__':
    # get text from wikipedia
    print('--- WIKI ---')
-    page = wikipage(wikipedia_page)
+    page = get_wikipage(wikipedia_page)
    if not page:
        sys.exit("--- STOP ---")
    title = '<h1>'+page.title+'</h1>'
--- a/exp.recommanded/make.py
+++ b/exp.recommanded/make.py
@ -0,0 +1,56 @@
 from jinja2 import Template
 from markdown import markdown
 import sys
 # appending a path
 sys.path.append('../')
 # importing customised module
 import summa.edits
 from summa.edits import scored_sentences, similarity_graph
 import wikipage
 from wikipage.page import get_wikipage
 wikipedia_page = "mushroom"
 #   main
 #   ------------------------------------------------------------------------
 # Script entry point: fetch the configured wikipedia page, print its plain
 # text, then build the sentence-similarity graph with the edited summa module.
 if __name__ == '__main__':
    # --- WIKI REQUEST ---
    # get text from wikipedia
    print('--- WIKI ---')
    page = get_wikipage(wikipedia_page)
    # get_wikipage gives back a falsy value when the name is ambiguous,
    # in which case there is nothing to process
    if not page:
        sys.exit("--- STOP ---")
    # NOTE(review): `title` is built here but not used in the lines shown
    title = '<h1>'+page.title+'</h1>'
    text = page.content
    # print text in terminal
    print('--- TXT ---')
    print(text)
    # --- APPLY TEXTRANK ---
    # build the sentence graph (nodes are sentence tokens, edges carry the
    # weights set by summa) without running the ranking step
    graph = similarity_graph(text)
    # print pairwise sentence similarities in terminal (not implemented yet)
    print('--- GRAPH ---')
    # NOTE(review): the sketch below is disabled and would not run as written:
    # `len(...)` is an int and cannot be iterated (needs `range(len(...))`),
    # the second print shows `s1` instead of `s2`, and `weight` is
    # concatenated to a str without conversion. Also confirm whether
    # `edge_weight` expects node indices or the nodes themselves.
    # for i in len(graph.nodes()):
    #     for j in len(graph.nodes()):
    #         s1 = graph.nodes()[i]
    #         s2 = graph.nodes()[j]
    #         weight = graph.edge_weight((i, j))
    #         print('---')
    #         print('1. ' + s1)
    #         print('2. ' + s1)
    #         print('similarity: ' + weight)
--- a/summa/init.pyc
+++ b/summa/init.pyc
--- a/summa/pycache/init.cpython-38.pyc
+++ b/summa/pycache/init.cpython-38.pyc
--- a/summa/pycache/edits.cpython-38.pyc
+++ b/summa/pycache/edits.cpython-38.pyc
--- a/summa/commons.pyc
+++ b/summa/commons.pyc
--- a/summa/edits.py
+++ b/summa/edits.py
@ -30,4 +30,18 @@ def scored_sentences(text, language="english", split=False, additional_stopwords
    # Adds the summa scores to the sentence objects.
    _add_scores_to_sentences(sentences, pagerank_scores)
-    return sentences
+    return sentences
 def similarity_graph(text, language="english", additional_stopwords=None):
    """Return the weighted sentence graph built from *text*.

    The text is split into cleaned sentences, a graph is built from the
    sentence tokens, and the edge weights are set by
    ``_set_graph_edge_weights``. No ranking or summarising is applied.

    ``language`` and ``additional_stopwords`` are forwarded unchanged to
    the sentence-cleaning step.

    Raises ``ValueError`` when *text* is not a ``str``.
    """
    if not isinstance(text, str):
        raise ValueError("Text parameter must be a Unicode object (str)!")

    # Split and normalise the text into sentence objects.
    cleaned = _clean_text_by_sentences(text, language, additional_stopwords)

    # One node per sentence token; the weights are then filled in place.
    tokens = [unit.token for unit in cleaned]
    sentence_graph = _build_graph(tokens)
    _set_graph_edge_weights(sentence_graph)

    return sentence_graph
--- a/wikipage/pycache/init.cpython-38.pyc
+++ b/wikipage/pycache/init.cpython-38.pyc
--- a/wikipage/pycache/page.cpython-38.pyc
+++ b/wikipage/pycache/page.cpython-38.pyc
--- a/wikipage/page.py
+++ b/wikipage/page.py
@ -0,0 +1,25 @@
 import wikipedia
 wikipedia.set_lang("en")
 #   wikipedia
 #   ------------------------------------------------------------------------
 def get_wikipage(pagename):
    """Fetch the Wikipedia page that best matches *pagename*.

    The name is first run through ``wikipedia.search`` (one result, no
    suggestion) and the top hit is loaded as a ``WikipediaPage``.

    Returns the loaded page, or an empty string when the lookup raises a
    ``DisambiguationError`` (the candidate options are printed first).
    Raises ``wikipedia.PageError`` when the search returns no result.
    """
    print(pagename)
    try:
        hits = wikipedia.search(pagename, results=1, suggestion=False)
        try:
            best_match = hits[0]
        except IndexError:
            # an empty result list means there is no such page
            raise wikipedia.PageError(pagename)
        return wikipedia.WikipediaPage(best_match, redirect=True, preload=True)
    except wikipedia.exceptions.DisambiguationError as ambiguity:
        # show the possible meanings so the caller can pick a precise name
        print(ambiguity.options)
    # falsy placeholder: callers test the result with `if not page`
    return ''