add setup.py

This commit is contained in:
emdee 2022-11-16 18:33:59 +00:00
parent 1d92e0ec65
commit c6a7d839d9
5 changed files with 84 additions and 81 deletions

View File

@ -13,4 +13,5 @@ try:
vsetup_logging(log_level, logfile='', stream=sys.stderr) vsetup_logging(log_level, logfile='', stream=sys.stderr)
except: pass except: pass
iMain(sys.argv[1:], bgui=False) if __name__ == '__main__':
iMain(sys.argv[1:], bgui=False)

View File

@ -1,9 +1,14 @@
#!/usr/local/bin/python3.sh #!/usr/local/bin/python3.sh
# -*-mode: python; indent-tabs-mode: nil; py-indent-offset: 4; coding: utf-8 -* # -*-mode: python; indent-tabs-mode: nil; py-indent-offset: 4; coding: utf-8 -*
# Looks for urls https://dns.google/resolve? """
# and parses them to extract a magic field. Looks for urls https://dns.google/resolve?
# https://dns.google/resolve?name=domain.name&type=TXT&cd=true&do=true https://dns.google/resolve?name=domain.name&type=TXT&cd=true&do=true
and parses them to extract a magic field.
A good example of how you can parse json embedded in HTML with phantomjs.
"""
import sys import sys
import os import os
@ -17,7 +22,7 @@ warnings.filterwarnings('ignore')
LOG = logging.getLogger() LOG = logging.getLogger()
class LookFor(Render): class LookFor(Render):
def __init__(self, app, do_print=True, do_save=False): def __init__(self, app, do_print=True, do_save=False):
app.lfps = [] app.lfps = []
self._app = app self._app = app
@ -37,7 +42,7 @@ class LookFor(Render):
fp = fp[:i] fp = fp[:i]
# threadsafe? # threadsafe?
self._app.lfps.append(fp) self._app.lfps.append(fp)
def _html_callback(self, *args): def _html_callback(self, *args):
"""print(self, QPrinter, Callable[[bool], None])""" """print(self, QPrinter, Callable[[bool], None])"""
if type(args[0]) is str: if type(args[0]) is str:
@ -72,8 +77,8 @@ class LookFor(Render):
self.we_run_this_tor_relay = False self.we_run_this_tor_relay = False
LOG.warn(f"BAD {self.uri}") LOG.warn(f"BAD {self.uri}")
return 2 return 2
def _loadFinished(self, result): def _loadFinished(self, result):
LOG.debug(f"phantom.py: Loading finished {self.uri}") LOG.debug(f"phantom.py: Loading finished {self.uri}")
self.toHtml(self._html_callback) self.toHtml(self._html_callback)

View File

@ -13,8 +13,8 @@ replacement for other bulky headless browser frameworks.
If you have a display attached: If you have a display attached:
./phantom.py [--pdf_output <pdf-file>] [--js_input <javascript-file>] <url-or-html-file> ./phantom.py [--pdf_output <pdf-file>] [--js_input <javascript-file>] <url-or-html-file>
If you don't have a display attached (i.e. on a remote server), you can use If you don't have a display attached (i.e. on a remote server), you can use
xvfb-run, or don't add --show_gui - it should work without a display. xvfb-run, or don't add --show_gui - it should work without a display.
@ -64,7 +64,7 @@ CSS @media types, etc.
Installation of dependencies in Debian Stretch is easy: Installation of dependencies in Debian Stretch is easy:
apt-get install xvfb python3-pyqt5 python3-pyqt5.qtwebkit apt-get install xvfb python3-pyqt5 python3-pyqt5.qtwebkit
Finding the equivalent for other OSes is an exercise that I leave to you. Finding the equivalent for other OSes is an exercise that I leave to you.
@ -80,16 +80,16 @@ Given the following file /tmp/test.html
document.getElementById('id1').innerHTML = "bar"; document.getElementById('id1').innerHTML = "bar";
</script> </script>
</html> </html>
... and the following file /tmp/test.js: ... and the following file /tmp/test.js:
document.getElementById('id2').innerHTML = "baz"; document.getElementById('id2').innerHTML = "baz";
console.log("__PHANTOM_PY_DONE__"); console.log("__PHANTOM_PY_DONE__");
... and running this script (without attached display) ... ... and running this script (without attached display) ...
xvfb-run python3 phantom.py --pdf_output /tmp/out.pdf --js_input /tmp/test.js /tmp/test.html xvfb-run python3 phantom.py --pdf_output /tmp/out.pdf --js_input /tmp/test.js /tmp/test.html
... you will get a PDF file /tmp/out.pdf with the contents "foo bar baz". ... you will get a PDF file /tmp/out.pdf with the contents "foo bar baz".
Note that the second occurrence of "foo" has been replaced by the web page's own Note that the second occurrence of "foo" has been replaced by the web page's own
@ -130,8 +130,6 @@ from PyQt5.QtWidgets import QApplication
from PyQt5.QtPrintSupport import QPrinter from PyQt5.QtPrintSupport import QPrinter
from PyQt5.QtWebEngineWidgets import QWebEnginePage from PyQt5.QtWebEngineWidgets import QWebEnginePage
from support_phantompy import vsetup_logging
global LOG global LOG
import logging import logging
import warnings import warnings
@ -161,19 +159,19 @@ def prepare(sdir='/tmp'):
</html> </html>
""") """)
LOG.debug(f"wrote {sfile} ") LOG.debug(f"wrote {sfile} ")
class Render(QWebEnginePage): class Render(QWebEnginePage):
def __init__(self, app, do_print=False, do_save=True):
    """Initialise per-page state, then the underlying QWebEnginePage.

    app: application object; kept as ``self._app`` and given a fresh,
        empty ``ldone`` list.
    do_print: stored as ``self.do_print``; when true, _onConsoleMessage
        calls ``self._print()`` once the save stage is reached.
    do_save: stored as ``self.do_save``; when true, _onConsoleMessage
        requests the page HTML via ``toHtml`` once the JS reports done.
    """
    # NOTE(review): this rebinds app.ldone on every construction, so a
    # second Render built on the same app discards the earlier list --
    # confirm that is intended.
    app.ldone = []
    self._app = app

    # Output switches consulted by _onConsoleMessage.
    self.do_print, self.do_save = do_print, do_save

    # Progress indicator; later stages raise it step by step.
    self.percent = 0

    # No URL or file targets are known yet.
    self.uri = self.jsfile = self.htmlfile = self.pdffile = None

    # Base-class initialisation stays last, as in the original.
    QWebEnginePage.__init__(self)
def run(self, url, pdffile, htmlfile, jsfile): def run(self, url, pdffile, htmlfile, jsfile):
self._app.lstart.append(id(self)) self._app.lstart.append(id(self))
@ -184,64 +182,65 @@ class Render(QWebEnginePage):
self.pdffile = pdffile self.pdffile = pdffile
self.outfile = pdffile or htmlfile self.outfile = pdffile or htmlfile
LOG.debug(f"phantom.py: URL={url} OUTFILE={outfile} JSFILE={jsfile}") LOG.debug(f"phantom.py: URL={url} OUTFILE={outfile} JSFILE={jsfile}")
qurl = QUrl.fromUserInput(url) qurl = QUrl.fromUserInput(url)
# The PDF generation only happens when the special string __PHANTOM_PY_DONE__ # The PDF generation only happens when the special string __PHANTOM_PY_DONE__
# is sent to console.log(). The following JS string will be executed by # is sent to console.log(). The following JS string will be executed by
# default, when no external JavaScript file is specified. # default, when no external JavaScript file is specified.
self.js_contents = "setTimeout(function() { console.log('__PHANTOM_PY_DONE__') }, 5000);"; self.js_contents = "setTimeout(function() { console.log('__PHANTOM_PY_DONE__') }, 5000);";
if jsfile: if jsfile:
try: try:
with open(self.jsfile, 'rt') as f: with open(self.jsfile, 'rt') as f:
self.js_contents = f.read() self.js_contents = f.read()
except Exception as e: except Exception as e:
LOG.exception(f"error reading jsfile {self.jsfile}") LOG.exception(f"error reading jsfile {self.jsfile}")
self.loadFinished.connect(self._loadFinished) self.loadFinished.connect(self._loadFinished)
self.percent = 20 self.percent = 20
self.load(qurl) self.load(qurl)
self.javaScriptConsoleMessage = self._onConsoleMessage self.javaScriptConsoleMessage = self._onConsoleMessage
LOG.debug(f"phantom.py: loading 10") LOG.debug(f"phantom.py: loading 10")
def _onConsoleMessage(self, *args):
    """Advance the render pipeline according to marker strings in a message.

    Accepts either (level, txt, lineno, filename) or (txt, lineno, filename);
    in the three-argument form the level defaults to 1.

    Stages, each falling through to the next when its action is disabled:
      * "__PHANTOM_PY_DONE__"    -> percent 40; with do_save, request the
        HTML via toHtml(self._html_callback) and stop here.
      * "__PHANTOM_PY_SAVED__"   -> percent 50; with do_print, call
        self._print() and stop here.
      * "__PHANTOM_PY_PRINTED__" -> percent 60; call self._exit(level).
    """
    if len(args) > 3:
        level, txt, lineno, filename = args
    else:
        level = 1
        txt, lineno, filename = args
    LOG.debug(f"CONSOLE {lineno} {txt} {filename}")

    # If we get this magic string, it means that the external JS is done.
    js_done = "__PHANTOM_PY_DONE__" in txt
    if js_done:
        self.percent = 40
        if self.do_save:
            self.toHtml(self._html_callback)
            return

    # A finished-but-unsaved page is treated exactly like a saved one.
    saved = js_done or "__PHANTOM_PY_SAVED__" in txt
    if saved:
        self.percent = 50
        if self.do_print:
            self._print()
            return

    # Likewise, a saved-but-unprinted page is treated like a printed one.
    if saved or "__PHANTOM_PY_PRINTED__" in txt:
        self.percent = 60
        self._exit(level)
def _loadFinished(self, result):
    """Record load completion and inject JavaScript into the page.

    result: the value delivered with the loadFinished signal; it is only
        logged here.
    """
    # RenderProcessTerminationStatus ?
    self.percent = 30
    LOG.info(f"phantom.py: _loadFinished {result} {self.percent}")
    LOG.debug(f"phantom.py: Evaluating JS from {self.jsfile}")
    # Make the document editable first, then run the configured script.
    scripts = (
        "document.documentElement.contentEditable=true",
        self.js_contents,
    )
    for script in scripts:
        self.runJavaScript(script)
def _html_callback(self, *args):
    """Receive the page HTML requested through ``toHtml``.

    When the first positional argument is a string it is written out via
    ``self._save`` and the "__PHANTOM_PY_SAVED__" marker is then fed to
    ``self._onConsoleMessage`` so the pipeline moves to its next stage.
    A call with no arguments, or with a non-string first argument, is
    ignored instead of raising IndexError.
    """
    # isinstance also accepts str subclasses, unlike an exact type() check.
    if not args or not isinstance(args[0], str):
        return
    self._save(args[0])
    self._onConsoleMessage(0, "__PHANTOM_PY_SAVED__", 0, '')
def _save(self, html): def _save(self, html):
sfile = self.htmlfile sfile = self.htmlfile
# CompleteHtmlSaveFormat SingleHtmlSaveFormat MimeHtmlSaveFormat # CompleteHtmlSaveFormat SingleHtmlSaveFormat MimeHtmlSaveFormat
@ -267,7 +266,7 @@ class Render(QWebEnginePage):
printer.setOutputFileName(sfile) printer.setOutputFileName(sfile)
self.print(printer, self._printer_callback) self.print(printer, self._printer_callback)
LOG.debug("phantom.py: Printed") LOG.debug("phantom.py: Printed")
def _exit(self, val): def _exit(self, val):
self.percent = 100 self.percent = 100
LOG.debug(f"phantom.py: Exiting with val {val}") LOG.debug(f"phantom.py: Exiting with val {val}")

View File

@ -13,6 +13,7 @@ from PyQt5.QtWidgets import (QProgressBar, QWidget, QVBoxLayout)
from phantompy import Render from phantompy import Render
# from lookupdns import LookFor as Render # from lookupdns import LookFor as Render
from support_phantompy import vsetup_logging, omain_argparser
global LOG global LOG
import logging import logging
@ -35,7 +36,7 @@ class Widget(QtWidgets.QWidget):
i = len(asyncio.all_tasks()) i = len(asyncio.all_tasks())
self._label.setText(str(i)) self._label.setText(str(i))
self.progress.setValue(int(text)) self.progress.setValue(int(text))
class ContextManager: class ContextManager:
def __init__(self) -> None: def __init__(self) -> None:
self._seconds = 0 self._seconds = 0
@ -63,25 +64,22 @@ async def main(widget, app, ilen):
app.exit() app.exit()
# raise asyncio.CancelledError # raise asyncio.CancelledError
return return
LOG.debug(f"{app.ldone} {perc} {seconds}") LOG.debug(f"{app.ldone} {seconds}")
except asyncio.CancelledError as ex: except asyncio.CancelledError as ex:
LOG.debug("Task cancelled") LOG.debug("Task cancelled")
def iMain(largs): def iMain(largs):
parser = oMainArgparser() parser = omain_argparser()
oargs = parser.parse_args(lArgs) oargs = parser.parse_args(largs)
bgui=oargs.show_gui bgui=oargs.show_gui
try: try:
from support_phantompy import vsetup_logging
d = int(os.environ.get('DEBUG', 0)) d = int(os.environ.get('DEBUG', 0))
if d > 0: if d > 0:
vsetup_logging(10, stream=sys.stderr) oargs.log_level = 10
else: vsetup_logging(oargs.log_level, logfile='', stream=sys.stderr)
vsetup_logging(oargs.log_level, stream=sys.stderr)
vsetup_logging(log_level, logfile='', stream=sys.stderr)
except: pass except: pass
app = QtWidgets.QApplication([]) app = QtWidgets.QApplication([])
app.lstart = [] app.lstart = []
if bgui: if bgui:
@ -90,7 +88,7 @@ def iMain(largs):
widget.show() widget.show()
else: else:
widget = None widget = None
loop = qasync.QEventLoop(app) loop = qasync.QEventLoop(app)
asyncio.set_event_loop(loop) asyncio.set_event_loop(loop)
@ -105,9 +103,9 @@ def iMain(largs):
uri = url.strip() uri = url.strip()
r.run(uri, pdffile, htmlfile, jsfile) r.run(uri, pdffile, htmlfile, jsfile)
LOG.debug(f"{r.percent} {app.lstart}") LOG.debug(f"{r.percent} {app.lstart}")
LOG.info(f"queued {len(app.lstart)} urls") LOG.info(f"queued {len(app.lstart)} urls")
task = loop.create_task(main(widget, app, 1)) task = loop.create_task(main(widget, app, 1))
loop.run_forever() loop.run_forever()
@ -117,6 +115,6 @@ def iMain(largs):
loop.run_until_complete(asyncio.gather(*tasks)) loop.run_until_complete(asyncio.gather(*tasks))
if __name__ == '__main__': if __name__ == '__main__':
iMain(sys.argv[1:]) iMain(sys.argv[1:])

View File

@ -80,7 +80,7 @@ def vsetup_logging(log_level, logfile='', stream=sys.stdout):
'NOTSET': logging.NOTSET, 'NOTSET': logging.NOTSET,
} }
def omain__argparser(_=None): def omain_argparser(_=None):
try: try:
from OpenSSL import SSL from OpenSSL import SSL
@ -106,9 +106,9 @@ def omain__argparser(_=None):
help="Operate on the HTML file with javascript") help="Operate on the HTML file with javascript")
parser.add_argument('--html_output', type=str, default='', parser.add_argument('--html_output', type=str, default='',
help="Write loaded and javascripted result to a HTML file") help="Write loaded and javascripted result to a HTML file")
parser.add_argument('--pdf_output', type=str, default=''), parser.add_argument('--pdf_output', type=str, default='',
help="Write loaded and javascripted result to a PDF file") help="Write loaded and javascripted result to a PDF file")
parser.add_argument('--show_gui', type=bool, store_action=True), parser.add_argument('--show_gui', type=bool, default=False, store_action=True),
help="show a progress meter that doesn't work") help="show a progress meter that doesn't work")
parser.add_argument('html_url', type=str, nargs='?', parser.add_argument('html_url', type=str, nargs='?',
required=True, required=True,