diff --git a/README.md b/README.md
index c61a8bb..50fc032 100644
--- a/README.md
+++ b/README.md
@@ -1,5 +1,12 @@
-## opacity experiment
+## edited summa (textrank)
+
+summa is a textrank python implementation (https://github.com/summanlp/textrank).
+it was modified under `summa/` by adding a `summa/edits.py` file that defines two new functions to access the internal processing steps of textrank:
+1. `scored_sentences`: gives the list of all the sentences with their score.
+2. `similarity_graph`: gives the graph of pairwise similarities between all the sentences in a text.
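+
+a minimal usage sketch (assuming the sentence objects expose `.text` and `.score`, as in summa's `SyntacticUnit`; `article.txt` is just a stand-in):
+
+    from summa.edits import scored_sentences, similarity_graph
+
+    text = open('article.txt').read()
+
+    # every sentence of the text, with its textrank score
+    for sentence in scored_sentences(text):
+        print(sentence.score, sentence.text)
+
+    # graph nodes are tokenised sentences, edge weights are similarities
+    graph = similarity_graph(text)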
+
+## [EXP] opacity
 
 For any wikipedia page, show the text content but where every sentence has an opacity inversely proportional to its TextRank score.
 
@@ -7,17 +14,20 @@ Meaning sentences considered as _"relevant to be included in a summary of the ar
 
 ### using
 
-* textrank python implementation (https://github.com/summanlp/textrank) modified under `summa/` so it gives us all the sentences with their score.
+* edited summa (https://github.com/summanlp/textrank)
 * wikipedia python module (https://pypi.org/project/wikipedia/)
 
 ### to use
 
 modify the variable `wikipedia_page` in `make.py` to whatever page, then
 
+    cd exp.opacity
     python3 make.py
 
 ### technical notes
 
 * **headers opacities** were manually recomputed as the average of their section; this is justified because they would otherwise break the flow of the document (their shortness seems to put them either nearly full black or full white, independently of how textrank ranks the paragraphs in their associated sections)
 * using the `.content` method of python wikipedia, we get **plain text plus headers in wikitext**, but markup such as inline HTML tags all disappeared. see if we want to craft a version using the `.html` method of python wikipedia, but it becomes more complex because of sentence tokenisation: we would probably need an index to keep track of each sentence's original nested div location.
-* **opacities were remapped** to add contrast to their curves. still need to experiment with that to find some kind of nice compromise on both paper and screen?
\ No newline at end of file
+* **opacities were remapped** to add contrast to their curves. still need to experiment with that to find some kind of nice compromise on both paper and screen?
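+
+as an illustration of the remapping idea (a sketch; the actual curve lives in `compress_score` in `make.py` and may differ), a smoothstep curve is steeper around the middle, so mid-range scores are pulled apart:
+
+    def compress_score(s):
+        # s is a normalised score in [0, 1]; smoothstep expands
+        # differences in the middle of the range, adding contrast
+        return s * s * (3 - 2 * s)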
+
+## [EXP] recommanded
\ No newline at end of file
diff --git a/exp.opacity/make.py b/exp.opacity/make.py
index 4d9626f..3889316 100644
--- a/exp.opacity/make.py
+++ b/exp.opacity/make.py
@@ -1,18 +1,16 @@
 from jinja2 import Template
-import os
-import wikipedia
 from markdown import markdown
-
-# importing module
 import sys
 
 # appending a path
 sys.path.append('../')
 
-# importing required module
-import summa.summarizer
-from summa.summarizer import scored_sentences
+# importing customised module
+import summa.edits
+from summa.edits import scored_sentences
+import wikipage
+from wikipage.page import get_wikipage
 
 # TODO:
 # * DONE: wiki header
@@ -57,30 +55,6 @@ def compress_score(s):
 
     return s
 
-
-# wikipedia
-# ------------------------------------------------------------------------
-
-def wikipage(pagename):
-    # get wikipedia page content by name of the page
-
-    print(pagename)
-    wikipedia.set_lang("en")
-    try:
-        results = wikipedia.search(pagename, results=1, suggestion=False)
-        try:
-            pagename = results[0]
-        except IndexError:
-            # if there is no suggestion or search results, the page doesn't exist
-            raise wikipedia.PageError(pagename)
-        return wikipedia.WikipediaPage(pagename, redirect=True, preload=True)
-    except wikipedia.exceptions.DisambiguationError as e:
-        print(e.options)
-        page = ''
-
-    return page
-
-
 # parsing and gluing html
 # ------------------------------------------------------------------------
 
@@ -180,7 +154,7 @@ if __name__ == '__main__':
 
     # get text from wikipedia
     print('--- WIKI ---')
-    page = wikipage(wikipedia_page)
+    page = get_wikipage(wikipedia_page)
     if not page:
         sys.exit("--- STOP ---")
     title = '<h1>'+page.title+'</h1>'
diff --git a/exp.recommanded/make.py b/exp.recommanded/make.py
new file mode 100644
index 0000000..d928084
--- /dev/null
+++ b/exp.recommanded/make.py
@@ -0,0 +1,56 @@
+from jinja2 import Template
+from markdown import markdown
+import sys
+
+# appending a path
+sys.path.append('../')
+
+# importing customised module
+import summa.edits
+from summa.edits import scored_sentences, similarity_graph
+
+import wikipage
+from wikipage.page import get_wikipage
+
+wikipedia_page = "mushroom"
+
+# main
+# ------------------------------------------------------------------------
+
+if __name__ == '__main__':
+
+    # --- WIKI REQUEST ---
+
+    # get text from wikipedia
+    print('--- WIKI ---')
+    page = get_wikipage(wikipedia_page)
+    if not page:
+        sys.exit("--- STOP ---")
+    title = '<h1>'+page.title+'</h1>'
+    text = page.content
+
+    # print text in terminal
+    print('--- TXT ---')
+    print(text)
+
+    # --- APPLY TEXTRANK ---
+
+    # build the sentence similarity graph
+    graph = similarity_graph(text)
+
+    # print pairwise sentence similarities in terminal
+    print('--- GRAPH ---')
+
+    # nodes are tokenised sentences; summa only stores an edge when the
+    # similarity between two sentences is non-zero
+    nodes = graph.nodes()
+    for i in range(len(nodes)):
+        for j in range(i + 1, len(nodes)):
+            s1 = nodes[i]
+            s2 = nodes[j]
+            edge = (s1, s2)
+            weight = graph.edge_weight(edge) if graph.has_edge(edge) else 0.0
+
+            print('---')
+            print('1. ' + s1)
+            print('2. ' + s2)
+            print('similarity: ' + str(weight))
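+
+    # possible next step (a sketch, not part of the original script): rank
+    # sentences by their total connection weight, i.e. how strongly the
+    # rest of the text "recommends" them
+    def total_weight(s):
+        return sum(graph.edge_weight((s, o)) for o in nodes
+                   if o != s and graph.has_edge((s, o)))
+
+    ranked = sorted(nodes, key=total_weight, reverse=True)
+    print('--- MOST CONNECTED ---')
+    print(ranked[0])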
diff --git a/summa/__init__.pyc b/summa/__init__.pyc
new file mode 100644
index 0000000..48d764e
Binary files /dev/null and b/summa/__init__.pyc differ
diff --git a/summa/__pycache__/__init__.cpython-38.pyc b/summa/__pycache__/__init__.cpython-38.pyc
index f0224f0..7801e26 100644
Binary files a/summa/__pycache__/__init__.cpython-38.pyc and b/summa/__pycache__/__init__.cpython-38.pyc differ
diff --git a/summa/__pycache__/edits.cpython-38.pyc b/summa/__pycache__/edits.cpython-38.pyc
new file mode 100644
index 0000000..c80775c
Binary files /dev/null and b/summa/__pycache__/edits.cpython-38.pyc differ
diff --git a/summa/commons.pyc b/summa/commons.pyc
new file mode 100644
index 0000000..8121443
Binary files /dev/null and b/summa/commons.pyc differ
diff --git a/summa/edits.py b/summa/edits.py
index ebb7b5d..b77fb96 100644
--- a/summa/edits.py
+++ b/summa/edits.py
@@ -30,4 +30,18 @@ def scored_sentences(text, language="english", split=False, additional_stopwords
 
     # Adds the summa scores to the sentence objects.
     _add_scores_to_sentences(sentences, pagerank_scores)
 
-    return sentences
\ No newline at end of file
+    return sentences
+
+
+def similarity_graph(text, language="english", additional_stopwords=None):
+    if not isinstance(text, str):
+        raise ValueError("Text parameter must be a Unicode object (str)!")
+
+    # Gets a list of processed sentences.
+    sentences = _clean_text_by_sentences(text, language, additional_stopwords)
+
+    # Creates the graph and calculates the similarity coefficient for every pair of nodes.
+    graph = _build_graph([sentence.token for sentence in sentences])
+    _set_graph_edge_weights(graph)
+
+    return graph
\ No newline at end of file
diff --git a/wikipage/__pycache__/__init__.cpython-38.pyc b/wikipage/__pycache__/__init__.cpython-38.pyc
new file mode 100644
index 0000000..1c40f12
Binary files /dev/null and b/wikipage/__pycache__/__init__.cpython-38.pyc differ
diff --git a/wikipage/__pycache__/page.cpython-38.pyc b/wikipage/__pycache__/page.cpython-38.pyc
new file mode 100644
index 0000000..6e1388a
Binary files /dev/null and b/wikipage/__pycache__/page.cpython-38.pyc differ
diff --git a/wikipage/page.py b/wikipage/page.py
new file mode 100644
index 0000000..9dc510f
--- /dev/null
+++ b/wikipage/page.py
@@ -0,0 +1,25 @@
+import wikipedia
+
+wikipedia.set_lang("en")
+
+# wikipedia
+# ------------------------------------------------------------------------
+
+def get_wikipage(pagename):
+    # get wikipedia page content by name of the page
+
+    print(pagename)
+    try:
+        results = wikipedia.search(pagename, results=1, suggestion=False)
+        try:
+            pagename = results[0]
+        except IndexError:
+            # if there is no suggestion or search results, the page doesn't exist
+            raise wikipedia.PageError(pagename)
+        return wikipedia.WikipediaPage(pagename, redirect=True, preload=True)
+    except wikipedia.exceptions.DisambiguationError as e:
+        print(e.options)
+        page = ''
+
+    return page
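+
+# minimal usage sketch (assumes network access; "mushroom" is just an example):
+#
+#   from wikipage.page import get_wikipage
+#
+#   page = get_wikipage("mushroom")
+#   if page:
+#       print(page.title)
+#       print(page.content[:200])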