wikipage getter in a separate folder so it's also usable from different experiments

master
Dorian 2 years ago
parent fcf80318be
commit 8ae06f04df

@ -1,5 +1,12 @@
## opacity experiment
## edited summa (textrank)
summa is a textrank python implementation (https://github.com/summanlp/textrank).
it was modified under `summa/` by adding a `summa/edits.py` file that creates two new functions to access the internal processing steps of textrank:
1. `scored_sentences`: gives the list of all the sentences with their score.
2. `similarity_graph`: gives the similarity graph of all the sentences in a text.
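A minimal usage sketch of the two functions (assuming the calling folder sits next to `summa/`, as the experiment folders do; the input file name is hypothetical):

```python
import sys
sys.path.append('../')  # so `summa.edits` is importable from an experiment folder

from summa.edits import scored_sentences, similarity_graph

text = open('article.txt').read()  # hypothetical input file

# each sentence object carries its text and its textrank score
for sentence in scored_sentences(text):
    print(round(sentence.score, 4), sentence.text)

# nodes are (tokenised) sentences, edge weights are pairwise similarities
graph = similarity_graph(text)
print(len(graph.nodes()), 'sentences in the similarity graph')
```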
## [EXP] opacity
For any wikipedia page, show the text content, but where every sentence has an opacity inversely proportional to its TextRank score.
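A sketch of that mapping (a hypothetical helper, not in the repo; the linear remap and the 0.1 floor are assumptions):

```python
def score_to_opacity(score, min_score, max_score):
    # normalise the textrank score to [0, 1]
    spread = (max_score - min_score) or 1.0  # guard against a flat score range
    t = (score - min_score) / spread
    # inverse mapping: the higher the score, the more the sentence fades out
    return round(1.0 - 0.9 * t, 2)  # stays in [0.1, 1.0] so every sentence remains printable
```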
@ -7,17 +14,20 @@ Meaning sentences considered as _"relevant to be included in a summary of the ar
### using
* textrank python implementation (https://github.com/summanlp/textrank) modified under `summa/` so it gives us all the sentences with their score.
* edited summa (https://github.com/summanlp/textrank)
* wikipedia python module (https://pypi.org/project/wikipedia/)
### to use
modify the variable `wikipedia_page` in `make.py` to whatever page you want, then:

    cd exp.opacity
    python3 make.py
### technical notes
* **header opacities** were manually recomputed as the average of their section (see the sketch after this list); this is justified because headers otherwise break the flow of the document (their shortness tends to push them to nearly full black or full white, independently of how textrank ranks the paragraphs in their associated sections)
* using the `.content` property of the python wikipedia module, we get **plain text plus headers in wikitext**, but things like `<p>`, `<ul>`, `<blockquote>`, etc. all disappear. see if we want to craft a version using the `.html` method of python wikipedia, but that becomes more complex because of sentence tokenisation; we would probably need an index to keep track of each sentence's original nested div location.
* **opacities were remapped** to add contrast to their curves (sketched below). still need to experiment with that to find some kind of nice compromise on both paper and screen.
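Sketches of the remap and header notes above (hypothetical helpers, not the repo's actual code; the smoothstep curve is just one possible contrast remap):

```python
def remap_contrast(opacity):
    # smoothstep s-curve: pushes values away from 0.5, adding contrast
    return round(opacity * opacity * (3 - 2 * opacity), 2)

def header_opacity(section_opacities):
    # a header takes the average opacity of its section's sentences
    return sum(section_opacities) / len(section_opacities)
```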
## [EXP] recommended

@ -1,18 +1,16 @@
from jinja2 import Template
import os
import wikipedia
from markdown import markdown
# importing module
import sys
# appending a path
sys.path.append('../')
# importing required module
import summa.summarizer
from summa.summarizer import scored_sentences
# importing customised module
import summa.edits
from summa.edits import scored_sentences
import wikipage
from wikipage.page import get_wikipage
# TODO:
# * DONE: wiki header
@ -57,30 +55,6 @@ def compress_score(s):
    return s
# wikipedia
# ------------------------------------------------------------------------
def wikipage(pagename):
    # get wikipedia page content by name of the page
    print(pagename)
    wikipedia.set_lang("en")
    try:
        results = wikipedia.search(pagename, results=1, suggestion=False)
        try:
            pagename = results[0]
        except IndexError:
            # if there is no suggestion or search results, the page doesn't exist
            raise wikipedia.PageError(pagename)
        return wikipedia.WikipediaPage(pagename, redirect=True, preload=True)
    except wikipedia.exceptions.DisambiguationError as e:
        print(e.options)
        page = ''
        return page
# parsing and gluing html
# ------------------------------------------------------------------------
@ -180,7 +154,7 @@ if __name__ == '__main__':
    # get text from wikipedia
    print('--- WIKI ---')
    page = wikipage(wikipedia_page)
    page = get_wikipage(wikipedia_page)
    if not page:
        sys.exit("--- STOP ---")
    title = '<h1>'+page.title+'</h1>'

@ -0,0 +1,56 @@
from jinja2 import Template
from markdown import markdown
import sys
# appending a path
sys.path.append('../')
# importing customised module
import summa.edits
from summa.edits import scored_sentences, similarity_graph
import wikipage
from wikipage.page import get_wikipage

wikipedia_page = "mushroom"

# main
# ------------------------------------------------------------------------
if __name__ == '__main__':

    # --- WIKI REQUEST ---

    # get text from wikipedia
    print('--- WIKI ---')
    page = get_wikipage(wikipedia_page)
    if not page:
        sys.exit("--- STOP ---")
    title = '<h1>'+page.title+'</h1>'
    text = page.content

    # print text in terminal
    print('--- TXT ---')
    print(text)

    # --- APPLY TEXTRANK ---

    # apply textrank
    graph = similarity_graph(text)

    # print ranked sentences in terminal
    print('--- GRAPH ---')
    # for i in range(len(graph.nodes())):
    #     for j in range(len(graph.nodes())):
    #         s1 = graph.nodes()[i]
    #         s2 = graph.nodes()[j]
    #         weight = graph.edge_weight((s1, s2))
    #         print('---')
    #         print('1. ' + s1)
    #         print('2. ' + s2)
    #         print('similarity: ' + str(weight))

Binary file not shown.

Binary file not shown.

@ -30,4 +30,18 @@ def scored_sentences(text, language="english", split=False, additional_stopwords
    # Adds the summa scores to the sentence objects.
    _add_scores_to_sentences(sentences, pagerank_scores)
    return sentences

def similarity_graph(text, language="english", additional_stopwords=None):
    if not isinstance(text, str):
        raise ValueError("Text parameter must be a Unicode object (str)!")
    # Gets a list of processed sentences.
    sentences = _clean_text_by_sentences(text, language, additional_stopwords)
    # Creates the graph and calculates the similarity coefficient for every pair of nodes.
    graph = _build_graph([sentence.token for sentence in sentences])
    _set_graph_edge_weights(graph)
    return graph
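# For reference, one way to inspect the returned graph (a sketch; it assumes
# summa's Graph exposes nodes(), has_edge() and edge_weight() over the
# sentence-token node keys used above):
#
#     g = similarity_graph("Mushrooms grow in forests. Forests are damp.")
#     nodes = g.nodes()
#     for i in range(len(nodes)):
#         for j in range(i + 1, len(nodes)):
#             edge = (nodes[i], nodes[j])
#             if g.has_edge(edge):
#                 print(nodes[i], '<->', nodes[j], '=', g.edge_weight(edge))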

@ -0,0 +1,25 @@
import wikipedia

wikipedia.set_lang("en")

# wikipedia
# ------------------------------------------------------------------------
def get_wikipage(pagename):
    # get wikipedia page content by name of the page
    print(pagename)
    try:
        results = wikipedia.search(pagename, results=1, suggestion=False)
        try:
            pagename = results[0]
        except IndexError:
            # if there is no suggestion or search results, the page doesn't exist
            raise wikipedia.PageError(pagename)
        return wikipedia.WikipediaPage(pagename, redirect=True, preload=True)
    except wikipedia.exceptions.DisambiguationError as e:
        print(e.options)
        page = ''
        return page
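# Example use from a sibling experiment folder (a sketch; it assumes the
# caller sits next to wikipage/, as exp.opacity does):
#
#     import sys
#     sys.path.append('../')
#     from wikipage.page import get_wikipage
#
#     page = get_wikipage("mushroom")
#     if page:
#         print(page.title)
#         print(page.content[:200])  # first 200 characters of plain-text content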