First
This commit is contained in:
parent
223339fc5d
commit
7e9f519835
28
LICENSE
28
LICENSE
@ -1,16 +1,20 @@
|
|||||||
Copyright (c) year copyright holder. All Rights Reserved.
|
# https://gist.githubusercontent.com/michaelfranzl/91f0cc13c56120391b949f885643e974/raw/a0601515e7a575bc4c7d4d2a20973b29b6c6f2df/phantom.py
|
||||||
|
|
||||||
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
|
Copyright 2017 Michael Karl Franzl
|
||||||
|
|
||||||
1.
|
Permission is hereby granted, free of charge, to any person obtaining a copy of
|
||||||
Redistribution of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
|
this software and associated documentation files (the "Software"), to deal in
|
||||||
|
the Software without restriction, including without limitation the rights to
|
||||||
|
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
|
||||||
|
of the Software, and to permit persons to whom the Software is furnished to do
|
||||||
|
so, subject to the following conditions:
|
||||||
|
|
||||||
2.
|
The above copyright notice and this permission notice shall be included in all
|
||||||
Redistribution in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
|
copies or substantial portions of the Software.
|
||||||
|
|
||||||
3.
|
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||||
Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
|
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
|
||||||
|
FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
|
||||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
|
||||||
|
IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
|
||||||
YOU ACKNOWLEDGE THAT THIS SOFTWARE IS NOT DESIGNED, LICENSED OR INTENDED FOR USE IN THE DESIGN, CONSTRUCTION, OPERATION OR MAINTENANCE OF ANY MILITARY FACILITY.
|
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||||
|
92
README.md
92
README.md
@ -1,3 +1,93 @@
|
|||||||
# phantompy
|
# phantompy
|
||||||
|
|
||||||
A simple replacement for phantomjs using PyQt
|
A simple replacement for phantomjs using PyQt.
|
||||||
|
|
||||||
|
This code is based on a brilliant idea of
|
||||||
|
[Michael Franzl](https://gist.github.com/michaelfranzl/91f0cc13c56120391b949f885643e974/raw/a0601515e7a575bc4c7d4d2a20973b29b6c6f2df/phantom.py)
|
||||||
|
that he wrote up in his
|
||||||
|
[blog](https://blog.michael.franzl.name/2017/10/16/phantom-py/index.html)
|
||||||
|
|
||||||
|
## Features
|
||||||
|
|
||||||
|
* Generate a PDF screenshot of the web page after it is completely loaded.
|
||||||
|
* Optionally execute a local JavaScript file specified by the argument
|
||||||
|
```javascript-file``` after the web page is completely loaded, and before the
|
||||||
|
PDF is generated. (YMMV - it segfaults for me. )
|
||||||
|
* Generate a HTML save file screenshot of the web page after it is
|
||||||
|
completely loaded and the javascript has run.
|
||||||
|
* console.log’s will be printed to stdout.
|
||||||
|
* Easily add new features by changing the source code of this script,
|
||||||
|
without compiling C++ code. For more advanced applications, consider
|
||||||
|
attaching PyQt objects/methods to WebKit’s JavaScript space by using
|
||||||
|
QWebFrame::addToJavaScriptWindowObject().
|
||||||
|
|
||||||
|
If you execute an external ```javascript-file```, phantompy has no
|
||||||
|
way of knowing when that script has finished doing its work. For this
|
||||||
|
reason, the external script should execute at the end
|
||||||
|
```console.log("__PHANTOM_PY_DONE__");``` when done. This will trigger
|
||||||
|
the PDF generation or the file saving, after which phantompy will exit.
|
||||||
|
|
||||||
|
If no ```__PHANTOM_PY_DONE__``` string is seen on the console for 10
|
||||||
|
seconds, phantom.py will exit without doing anything. This behavior
|
||||||
|
could be implemented more elegantly without console.log’s but it is
|
||||||
|
the simplest solution.
|
||||||
|
|
||||||
|
It is important to remember that since you’re just running WebKit, you can
|
||||||
|
use everything that WebKit supports, including the usual JS client
|
||||||
|
libraries, CSS, CSS @media types, etc.
|
||||||
|
|
||||||
|
## Dependencies
|
||||||
|
|
||||||
|
* Python3
|
||||||
|
* PyQt5 (this should work with PySide2 and PyQt6 - let us know.)
|
||||||
|
* [qasync](https://github.com/CabbageDevelopment/qasync) for the
|
||||||
|
standalone program ```qasync_lookup.py```
|
||||||
|
|
||||||
|
## Standalone
|
||||||
|
|
||||||
|
A standalone program is a little tricky as PyQt5.QtWebEngineWidgets'
|
||||||
|
QWebEnginePage uses callbacks at each step of the way:
|
||||||
|
1) loading the page = ```Render.run```
|
||||||
|
2) running javascript in and on the page = ```Render._loadFinished```
|
||||||
|
3) saving the page = ```Render.toHtml and _html_callback```
|
||||||
|
4) printing the page = ```Render._print```
|
||||||
|
|
||||||
|
The steps get chained by printing special messages to the Python
|
||||||
|
renderer of the JavaScript console: ```Render._onConsoleMessage```
|
||||||
|
|
||||||
|
So it makes it hard if you want the standalone program to work without
|
||||||
|
a GUI, or in combination with another Qt program that is responsible
|
||||||
|
for the PyQt ```app.exec``` and the exiting of the program.
|
||||||
|
|
||||||
|
We've decided to use the best of the shims that merge the Python
|
||||||
|
```asyncio``` and Qt event loops:
|
||||||
|
[qasync](https://github.com/CabbageDevelopment/qasync). This is seen as
|
||||||
|
the successor to the sorta abandoned [quamash](https://github.com/harvimt/quamash).
|
||||||
|
The code is based on a
|
||||||
|
[comment](https://github.com/CabbageDevelopment/qasync/issues/35#issuecomment-1315060043)
|
||||||
|
by [Alex Marcha](https://github.com/hosaka) whose excellent code helped me.
|
||||||
|
As this is my first use of ```asyncio``` and ```qasync``` I may have
|
||||||
|
introduced some errors and it may be improved on, but it works, and
|
||||||
|
it is not a monolithic Qt program.
|
||||||
|
|
||||||
|
## Usage
|
||||||
|
|
||||||
|
The standalone program is ```qasync_phantompy.py```
|
||||||
|
|
||||||
|
|
||||||
|
### Arguments
|
||||||
|
|
||||||
|
```
|
||||||
|
<url> Can be a http(s) URL or a path to a local file
|
||||||
|
<pdf-file> Path and name of PDF file to generate
|
||||||
|
[<javascript-file>] (optional) Path and name of a JavaScript file to execute
|
||||||
|
```
|
||||||
|
|
||||||
|
|
||||||
|
## Postscript
|
||||||
|
|
||||||
|
When I think of all the trouble people went to compiling and
|
||||||
|
maintaining the tonnes of C++ code that went into
|
||||||
|
[phantomjs](https://github.com/ariya/phantomjs), I am amazed that it
|
||||||
|
can be replaced with a couple of hundred lines of Python!
|
||||||
|
|
||||||
|
756
doc/blog.michael.franzl.name/2017/10/16/phantom-py/index.html
Normal file
756
doc/blog.michael.franzl.name/2017/10/16/phantom-py/index.html
Normal file
File diff suppressed because one or more lines are too long
@ -0,0 +1,207 @@
|
|||||||
|
#!/usr/bin/python3
|
||||||
|
|
||||||
|
"""
|
||||||
|
# phantom.py
|
||||||
|
|
||||||
|
Simple but fully scriptable headless QtWebKit browser using PyQt5 in Python3,
|
||||||
|
specialized in executing external JavaScript and generating PDF files. A lean
|
||||||
|
replacement for other bulky headless browser frameworks.
|
||||||
|
|
||||||
|
|
||||||
|
## Usage
|
||||||
|
|
||||||
|
If you have a display attached:
|
||||||
|
|
||||||
|
./phantom.py <url> <pdf-file> [<javascript-file>]
|
||||||
|
|
||||||
|
If you don't have a display attached (i.e. on a remote server):
|
||||||
|
|
||||||
|
xvfb-run ./phantom.py <url> <pdf-file> [<javascript-file>]
|
||||||
|
|
||||||
|
Arguments:
|
||||||
|
|
||||||
|
<url> Can be a http(s) URL or a path to a local file
|
||||||
|
<pdf-file> Path and name of PDF file to generate
|
||||||
|
[<javascript-file>] (optional) Path and name of a JavaScript file to execute
|
||||||
|
|
||||||
|
|
||||||
|
## Features
|
||||||
|
|
||||||
|
* Generate a PDF screenshot of the web page after it is completely loaded.
|
||||||
|
* Optionally execute a local JavaScript file specified by the argument
|
||||||
|
<javascript-file> after the web page is completely loaded, and before
|
||||||
|
the PDF is generated.
|
||||||
|
* console.log's will be printed to stdout.
|
||||||
|
* Easily add new features by changing the source code of this script, without
|
||||||
|
compiling C++ code. For more advanced applications, consider attaching
|
||||||
|
PyQt objects/methods to WebKit's JavaScript space by using
|
||||||
|
`QWebFrame::addToJavaScriptWindowObject()`.
|
||||||
|
|
||||||
|
If you execute an external <javascript-file>, phantom.py has no way of knowing
|
||||||
|
when that script has finished doing its work. For this reason, the external
|
||||||
|
script should execute `console.log("__PHANTOM_PY_DONE__");` when done. This will
|
||||||
|
trigger the PDF generation, after which phantom.py will exit. If no
|
||||||
|
`__PHANTOM_PY_DONE__` string is seen on the console for 10 seconds, phantom.py
|
||||||
|
will exit without doing anything. This behavior could be implemented more
|
||||||
|
elegantly without console.log's but it is the simplest solution.
|
||||||
|
|
||||||
|
It is important to remember that since you're just running WebKit, you can use
|
||||||
|
everything that WebKit supports, including the usual JS client libraries, CSS,
|
||||||
|
CSS @media types, etc.
|
||||||
|
|
||||||
|
|
||||||
|
## Dependencies
|
||||||
|
|
||||||
|
* Python3
|
||||||
|
* PyQt5
|
||||||
|
* xvfb (optional for display-less machines)
|
||||||
|
|
||||||
|
Installation of dependencies in Debian Stretch is easy:
|
||||||
|
|
||||||
|
apt-get install xvfb python3-pyqt5 python3-pyqt5.qtwebkit
|
||||||
|
|
||||||
|
Finding the equivalent for other OSes is an exercise that I leave to you.
|
||||||
|
|
||||||
|
|
||||||
|
## Examples
|
||||||
|
|
||||||
|
Given the following file /tmp/test.html
|
||||||
|
|
||||||
|
<html>
|
||||||
|
<body>
|
||||||
|
<p>foo <span id="id1">foo</span> <span id="id2">foo</span></p>
|
||||||
|
</body>
|
||||||
|
<script>
|
||||||
|
document.getElementById('id1').innerHTML = "bar";
|
||||||
|
</script>
|
||||||
|
</html>
|
||||||
|
|
||||||
|
... and the following file /tmp/test.js:
|
||||||
|
|
||||||
|
document.getElementById('id2').innerHTML = "baz";
|
||||||
|
console.log("__PHANTOM_PY_DONE__");
|
||||||
|
|
||||||
|
... and running this script (without attached display) ...
|
||||||
|
|
||||||
|
xvfb-run python3 phantom.py /tmp/test.html /tmp/out.pdf /tmp/test.js
|
||||||
|
|
||||||
|
... you will get a PDF file /tmp/out.pdf with the contents "foo bar baz".
|
||||||
|
|
||||||
|
Note that the second occurrence of "foo" has been replaced by the web page's own
|
||||||
|
script, and the third occurrence of "foo" by the external JS file.
|
||||||
|
|
||||||
|
|
||||||
|
## License
|
||||||
|
|
||||||
|
Copyright 2017 Michael Karl Franzl
|
||||||
|
|
||||||
|
Permission is hereby granted, free of charge, to any person obtaining a copy of
|
||||||
|
this software and associated documentation files (the "Software"), to deal in
|
||||||
|
the Software without restriction, including without limitation the rights to
|
||||||
|
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
|
||||||
|
of the Software, and to permit persons to whom the Software is furnished to do
|
||||||
|
so, subject to the following conditions:
|
||||||
|
|
||||||
|
The above copyright notice and this permission notice shall be included in all
|
||||||
|
copies or substantial portions of the Software.
|
||||||
|
|
||||||
|
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||||
|
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
|
||||||
|
FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
|
||||||
|
COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
|
||||||
|
IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
|
||||||
|
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import sys
|
||||||
|
from PyQt5.QtCore import QUrl
|
||||||
|
from PyQt5.QtWebKitWidgets import QWebPage
|
||||||
|
from PyQt5.QtWidgets import QApplication
|
||||||
|
from PyQt5.QtPrintSupport import QPrinter
|
||||||
|
from PyQt5.QtCore import QTimer
|
||||||
|
import traceback
|
||||||
|
|
||||||
|
|
||||||
|
class Render(QWebPage):
    """Headless QtWebKit page: load a URL, run JavaScript, print a PDF.

    Constructing an instance does all the work: it creates the
    ``QApplication``, starts loading ``url`` and then blocks in the Qt event
    loop.  The process exits from ``_exit`` with one of these codes:

    * ``0``  -- ``__PHANTOM_PY_DONE__`` was seen and the PDF was printed,
    * ``1``  -- the 10 second watchdog fired first,
    * ``10`` -- ``jsfile`` could not be read.
    """

    def __init__(self, url, outfile, jsfile):
        """Start rendering.

        Args:
            url: http(s) URL or local path, resolved by ``QUrl.fromUserInput``.
            outfile: path of the PDF file to write.
            jsfile: optional path of a JavaScript file to evaluate once the
                page has loaded; ``None`` selects the built-in default script.
        """
        # A QApplication has to exist before the QWebPage is constructed.
        self.app = QApplication(sys.argv)

        QWebPage.__init__(self)

        self.jsfile = jsfile
        self.outfile = outfile

        qurl = QUrl.fromUserInput(url)

        print("phantom.py: URL=", qurl, "OUTFILE=", outfile, "JSFILE=", jsfile)

        # The PDF generation only happens when the special string
        # __PHANTOM_PY_DONE__ is sent to console.log(). The following JS
        # string is executed by default, when no external JavaScript file is
        # specified.
        self.js_contents = "setTimeout(function() { console.log('__PHANTOM_PY_DONE__') }, 500);"

        if jsfile:
            try:
                # ``with`` closes the handle even when read() raises.
                with open(self.jsfile) as f:
                    self.js_contents = f.read()
            except (OSError, UnicodeError):
                print(traceback.format_exc())
                self._exit(10)

        # Install the console hook before the load starts so that no message
        # emitted by the page can be missed.
        self.javaScriptConsoleMessage = self._onConsoleMessage
        self.loadFinished.connect(self._loadFinished)
        self.mainFrame().load(qurl)

        # Run for a maximum of 10 seconds.  The local reference stays alive
        # because exec_() below does not return until the application exits.
        watchdog = QTimer()
        watchdog.setSingleShot(True)
        watchdog.timeout.connect(lambda: self._exit(1))
        watchdog.start(10000)

        self.app.exec_()

    def _onConsoleMessage(self, txt, lineno, filename):
        """Echo a JavaScript console message and react to the done marker."""
        print("CONSOLE", lineno, txt, filename)
        if "__PHANTOM_PY_DONE__" in txt:
            # If we get this magic string, it means that the external JS is done
            self._print()
            self._exit(0)

    def _loadFinished(self, result):
        """Evaluate the configured JavaScript once the page has loaded."""
        print("phantom.py: Loading finished!")
        print("phantom.py: Evaluating JS from", self.jsfile)
        self.frame = self.mainFrame()
        self.frame.evaluateJavaScript(self.js_contents)

    def _print(self):
        """Print the main frame to ``self.outfile`` as an A4 PDF."""
        print("phantom.py: Printing...")
        printer = QPrinter()
        printer.setPageMargins(10, 10, 10, 10, QPrinter.Millimeter)
        printer.setPaperSize(QPrinter.A4)
        printer.setCreator("phantom.py by Michael Karl Franzl")
        printer.setOutputFormat(QPrinter.PdfFormat)
        printer.setOutputFileName(self.outfile)
        # Ask for the frame directly: ``self.frame`` is only assigned in
        # _loadFinished, but the page's own script may log the done marker
        # before that signal has been delivered.
        self.mainFrame().print(printer)

    def _exit(self, val):
        """Stop the Qt event loop and terminate the process with ``val``."""
        print("phantom.py: Exiting with val", val)
        self.app.exit(val)  # Qt exit
        # sys.exit rather than the builtin-looking ``exit``: the latter is
        # injected by the ``site`` module and is missing under ``python -S``.
        sys.exit(val)  # Python exit
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Command-line entry point.

    Reads ``<url> <pdf-file> [<javascript-file>]`` from ``sys.argv``.  With
    fewer than two arguments a usage line is printed and nothing else
    happens; otherwise a ``Render`` is constructed from the arguments.
    """
    argv = sys.argv
    if len(argv) < 3:
        print("USAGE: ./phantom.py <url> <pdf-file> [<javascript-file>]")
        return

    url, outfile = argv[1], argv[2]
    # The JavaScript file is optional.
    jsfile = None
    if len(argv) > 3:
        jsfile = argv[3]
    renderer = Render(url, outfile, jsfile)


if __name__ == "__main__":
    main()
|
290
phantompy.py
Normal file
290
phantompy.py
Normal file
@ -0,0 +1,290 @@
|
|||||||
|
#!/usr/local/bin/python3.sh
|
||||||
|
# -*-mode: python; indent-tabs-mode: nil; py-indent-offset: 2; coding: utf-8 -*-
|
||||||
|
# https://gist.github.com/michaelfranzl/91f0cc13c56120391b949f885643e974/raw/a0601515e7a575bc4c7d4d2a20973b29b6c6f2df/phantom.py
|
||||||
|
"""
|
||||||
|
# phantom.py
|
||||||
|
|
||||||
|
Simple but fully scriptable headless QtWebKit browser using PyQt5 in Python3,
|
||||||
|
specialized in executing external JavaScript and generating PDF files. A lean
|
||||||
|
replacement for other bulky headless browser frameworks.
|
||||||
|
|
||||||
|
|
||||||
|
## Usage
|
||||||
|
|
||||||
|
If you have a display attached:
|
||||||
|
|
||||||
|
./phantom.py <url> <pdf-file> [<javascript-file>]
|
||||||
|
|
||||||
|
If you don't have a display attached (i.e. on a remote server):
|
||||||
|
|
||||||
|
xvfb-run ./phantom.py <url> <pdf-file> [<javascript-file>]
|
||||||
|
|
||||||
|
Arguments:
|
||||||
|
|
||||||
|
<url> Can be a http(s) URL or a path to a local file
|
||||||
|
<pdf-file> Path and name of PDF file to generate
|
||||||
|
[<javascript-file>] (optional) Path and name of a JavaScript file to execute
|
||||||
|
|
||||||
|
|
||||||
|
## Features
|
||||||
|
|
||||||
|
* Generate a PDF screenshot of the web page after it is completely loaded.
|
||||||
|
* Optionally execute a local JavaScript file specified by the argument
|
||||||
|
<javascript-file> after the web page is completely loaded, and before
|
||||||
|
the PDF is generated.
|
||||||
|
* console.log's will be printed to stdout.
|
||||||
|
* Easily add new features by changing the source code of this script, without
|
||||||
|
compiling C++ code. For more advanced applications, consider attaching
|
||||||
|
PyQt objects/methods to WebKit's JavaScript space by using
|
||||||
|
`QWebFrame::addToJavaScriptWindowObject()`.
|
||||||
|
|
||||||
|
If you execute an external <javascript-file>, phantom.py has no way of knowing
|
||||||
|
when that script has finished doing its work. For this reason, the external
|
||||||
|
script should execute `console.log("__PHANTOM_PY_DONE__");` when done. This will
|
||||||
|
trigger the PDF generation, after which phantom.py will exit. If no
|
||||||
|
`__PHANTOM_PY_DONE__` string is seen on the console for 10 seconds, phantom.py
|
||||||
|
will exit without doing anything. This behavior could be implemented more
|
||||||
|
elegantly without console.log's but it is the simplest solution.
|
||||||
|
|
||||||
|
It is important to remember that since you're just running WebKit, you can use
|
||||||
|
everything that WebKit supports, including the usual JS client libraries, CSS,
|
||||||
|
CSS @media types, etc.
|
||||||
|
|
||||||
|
|
||||||
|
## Dependencies
|
||||||
|
|
||||||
|
* Python3
|
||||||
|
* PyQt5
|
||||||
|
* xvfb (optional for display-less machines)
|
||||||
|
|
||||||
|
Installation of dependencies in Debian Stretch is easy:
|
||||||
|
|
||||||
|
apt-get install xvfb python3-pyqt5 python3-pyqt5.qtwebkit
|
||||||
|
|
||||||
|
Finding the equivalent for other OSes is an exercise that I leave to you.
|
||||||
|
|
||||||
|
|
||||||
|
## Examples
|
||||||
|
|
||||||
|
Given the following file /tmp/test.html
|
||||||
|
|
||||||
|
<html>
|
||||||
|
<body>
|
||||||
|
<p>foo <span id="id1">foo</span> <span id="id2">foo</span></p>
|
||||||
|
</body>
|
||||||
|
<script>
|
||||||
|
document.getElementById('id1').innerHTML = "bar";
|
||||||
|
</script>
|
||||||
|
</html>
|
||||||
|
|
||||||
|
... and the following file /tmp/test.js:
|
||||||
|
|
||||||
|
document.getElementById('id2').innerHTML = "baz";
|
||||||
|
console.log("__PHANTOM_PY_DONE__");
|
||||||
|
|
||||||
|
... and running this script (without attached display) ...
|
||||||
|
|
||||||
|
xvfb-run python3 phantom.py /tmp/test.html /tmp/out.pdf /tmp/test.js
|
||||||
|
|
||||||
|
... you will get a PDF file /tmp/out.pdf with the contents "foo bar baz".
|
||||||
|
|
||||||
|
Note that the second occurrence of "foo" has been replaced by the web page's own
|
||||||
|
script, and the third occurrence of "foo" by the external JS file.
|
||||||
|
|
||||||
|
|
||||||
|
## License
|
||||||
|
|
||||||
|
Copyright 2017 Michael Karl Franzl
|
||||||
|
|
||||||
|
Permission is hereby granted, free of charge, to any person obtaining a copy of
|
||||||
|
this software and associated documentation files (the "Software"), to deal in
|
||||||
|
the Software without restriction, including without limitation the rights to
|
||||||
|
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
|
||||||
|
of the Software, and to permit persons to whom the Software is furnished to do
|
||||||
|
so, subject to the following conditions:
|
||||||
|
|
||||||
|
The above copyright notice and this permission notice shall be included in all
|
||||||
|
copies or substantial portions of the Software.
|
||||||
|
|
||||||
|
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||||
|
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
|
||||||
|
FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
|
||||||
|
COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
|
||||||
|
IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
|
||||||
|
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import sys
|
||||||
|
import os
|
||||||
|
import traceback
|
||||||
|
import atexit
|
||||||
|
import time
|
||||||
|
|
||||||
|
from PyQt5.QtCore import QUrl
|
||||||
|
from PyQt5.QtCore import QTimer
|
||||||
|
from PyQt5.QtWidgets import QApplication
|
||||||
|
from PyQt5.QtPrintSupport import QPrinter
|
||||||
|
from PyQt5.QtWebEngineWidgets import QWebEnginePage
|
||||||
|
|
||||||
|
import logging
import warnings

# Suppress every warning category for the whole process.
warnings.filterwarnings('ignore')

# Module-wide logger; this is the root logger, so its level and handlers are
# whatever the embedding program configures.
LOG = logging.getLogger()
|
||||||
|
|
||||||
|
def prepare(sdir='/tmp'):
    """Create the sample ``test.js`` and ``test.html`` files inside *sdir*.

    A file that already exists is left untouched.  Returns ``None``.
    """
    samples = (
        ('test.js', """
document.getElementById('id2').innerHTML = "baz";
console.log("__PHANTOM_PY_DONE__");
"""),
        ('test.html', """
<html>
<body>
<p>foo <span id="id1">foo</span> <span id="id2">foo</span></p>
</body>
<script>
document.getElementById('id1').innerHTML = "bar";
</script>
</html>
"""),
    )
    for basename, text in samples:
        target = os.path.join(sdir, basename)
        if os.path.exists(target):
            # Never overwrite a file the user may have edited.
            continue
        with open(target, 'wt') as out:
            out.write(text)
        LOG.debug(f"wrote {target} ")
|
||||||
|
|
||||||
|
class Render(QWebEnginePage):
    """QtWebEngine page that loads a URL, runs JavaScript, then saves/prints it.

    The asynchronous steps are chained through marker strings handled by
    ``_onConsoleMessage``:

    * ``__PHANTOM_PY_DONE__``    -- the page's JavaScript finished; save the
      HTML when ``do_save`` is set,
    * ``__PHANTOM_PY_SAVED__``   -- the HTML was saved; print the PDF when
      ``do_print`` is set,
    * ``__PHANTOM_PY_PRINTED__`` -- everything is done; the URI is appended
      to ``app.ldone``.

    ``percent`` is a coarse progress value that is bumped at each step
    (10, 20, 30, 40, 50, 60 and finally 100).  The class neither runs nor
    stops the Qt event loop; the caller pumps events and watches
    ``app.ldone``.
    """

    def __init__(self, app, do_print=False, do_save=True):
        """Create an idle page.

        Args:
            app: the ``QApplication``; used as a shared scoreboard through
                its ``lstart`` and ``ldone`` list attributes.
            do_print: print a PDF after the JavaScript has finished.
            do_save: save the rendered HTML after the JavaScript has finished.
        """
        QWebEnginePage.__init__(self)
        # NOTE(review): ``ldone`` is reset for every new page, exactly as
        # before; callers that poll ``len(app.ldone)`` for a single page
        # depend on it.  Several concurrent pages on one app would lose
        # earlier entries -- confirm before using it that way.
        app.ldone = []
        # run() appends to ``lstart``; create it when the caller did not, so
        # that a plain QApplication can be passed in.
        if not hasattr(app, 'lstart'):
            app.lstart = []
        self._app = app
        self.do_print = do_print
        self.do_save = do_save
        self.percent = 0
        self.uri = None
        self.jsfile = None
        self.outfile = None
        # Keeps the QPrinter alive while an asynchronous print is running.
        self._printer = None

    def run(self, url, outfile, jsfile):
        """Start loading ``url``; returns immediately.

        Args:
            url: http(s) URL or local path, resolved by ``QUrl.fromUserInput``.
            outfile: output path; ``.pdf``/``.html`` are swapped as needed
                for the saved HTML and the printed PDF.
            jsfile: optional JavaScript file evaluated after the page loaded.
                When it cannot be read the error is logged and the built-in
                default script is used instead.
        """
        self._app.lstart.append(id(self))
        self.percent = 10
        self.uri = url
        self.jsfile = jsfile
        self.outfile = outfile
        LOG.debug(f"phantom.py: URL={url} OUTFILE={outfile} JSFILE={jsfile}")
        qurl = QUrl.fromUserInput(url)

        # The PDF generation only happens when the special string
        # __PHANTOM_PY_DONE__ is sent to console.log(). The following JS
        # string is executed by default, when no external JavaScript file is
        # specified.
        self.js_contents = "setTimeout(function() { console.log('__PHANTOM_PY_DONE__') }, 5000);"

        if jsfile:
            try:
                # Explicit encoding: do not depend on the process locale.
                with open(self.jsfile, 'rt', encoding='utf-8') as f:
                    self.js_contents = f.read()
            except Exception:
                # Best effort: keep going with the default script.
                LOG.exception(f"error reading jsfile {self.jsfile}")

        # Install the console hook before the load starts so that no message
        # emitted by the page can be missed.
        self.javaScriptConsoleMessage = self._onConsoleMessage
        self.loadFinished.connect(self._loadFinished)
        self.percent = 20
        self.load(qurl)
        LOG.debug("phantom.py: loading 10")

    def _onConsoleMessage(self, *args):
        """Handle a console message and advance the save/print/exit chain.

        Accepts either ``(level, text, lineno, filename)`` or the
        three-argument form without ``level`` (``level`` then defaults to 1).
        """
        if len(args) > 3:
            level, txt, lineno, filename = args
        else:
            level = 1
            txt, lineno, filename = args
        LOG.debug(f"CONSOLE {lineno} {txt} {filename}")
        if "__PHANTOM_PY_DONE__" in txt:
            self.percent = 40
            # If we get this magic string, it means that the external JS is done
            if self.do_save:
                # Asynchronous: _html_callback re-enters with ..._SAVED__.
                self.toHtml(self._html_callback)
                return
            # Nothing to save: drop through to the next stage.
            txt = "__PHANTOM_PY_SAVED__"
        if "__PHANTOM_PY_SAVED__" in txt:
            self.percent = 50
            if self.do_print:
                # Asynchronous: _printer_callback re-enters with ..._PRINTED__.
                self._print()
                return
            txt = "__PHANTOM_PY_PRINTED__"
        if "__PHANTOM_PY_PRINTED__" in txt:
            self.percent = 60
            self._exit(level)

    def _loadFinished(self, result):
        """Run the configured JavaScript once the page has finished loading."""
        self.percent = 30
        LOG.info(f"phantom.py: _loadFinished {result} {self.percent}")
        LOG.debug(f"phantom.py: Evaluating JS from {self.jsfile}")
        self.runJavaScript("document.documentElement.contentEditable=true")
        self.runJavaScript(self.js_contents)

    def _html_callback(self, *args):
        """Receive the page HTML from ``toHtml`` and continue the chain.

        ``args[0]`` is the HTML text.  The ``__PHANTOM_PY_SAVED__`` step is
        always triggered afterwards, even when saving failed, so that a
        caller polling ``app.ldone`` is not left waiting forever.
        """
        if isinstance(args[0], str):
            try:
                self._save(args[0])
            except OSError:
                # Do not let the error escape into the Qt callback machinery.
                LOG.exception(f"error saving html for {self.uri}")
        self._onConsoleMessage(0, "__PHANTOM_PY_SAVED__", 0, '')

    def _save(self, html):
        """Write ``html`` to the output path with ``.pdf`` replaced by ``.html``."""
        sfile = self.outfile.replace('.pdf', '.html')
        # CompleteHtmlSaveFormat SingleHtmlSaveFormat MimeHtmlSaveFormat
        # UTF-8 explicitly: page text routinely contains characters that the
        # locale's default encoding cannot represent.
        with open(sfile, 'wt', encoding='utf-8') as ofd:
            ofd.write(html)
        LOG.debug(f"Saved {sfile}")

    def _printer_callback(self, *args):
        """Receive the print result and continue the chain.

        ``args[0]`` is the success flag; ``False`` is reported as level 1,
        anything else as level 0.
        """
        # print(f"_printer_callback {self.outfile} {args}")
        i = 1 if args[0] is False else 0
        # The print job is over, so the printer may be released now.
        self._printer = None
        self._onConsoleMessage(i, "__PHANTOM_PY_PRINTED__", 0, '')

    def _print(self):
        """Start printing the page to an A4 PDF (``.html`` -> ``.pdf``)."""
        sfile = self.outfile.replace('.html', '.pdf')
        printer = QPrinter()
        printer.setPageMargins(10, 10, 10, 10, QPrinter.Millimeter)
        printer.setPaperSize(QPrinter.A4)
        printer.setCreator("phantom.py by Michael Karl Franzl")
        printer.setOutputFormat(QPrinter.PdfFormat)
        printer.setOutputFileName(sfile)
        # QWebEnginePage.print() is asynchronous and requires the printer to
        # stay valid until the callback ran; a local variable would be
        # destroyed as soon as this method returns.
        self._printer = printer
        self.print(printer, self._printer_callback)
        # Logged when the job has been handed over, not when it completed.
        LOG.debug("phantom.py: Printed")

    def _exit(self, val):
        """Mark this page as finished by appending its URI to ``app.ldone``."""
        self.percent = 100
        LOG.debug(f"phantom.py: Exiting with val {val}")
        # threadsafe?
        self._app.ldone.append(self.uri)
|
||||||
|
|
||||||
|
def omain(app, largs):
    """Render a single page, pumping Qt events by hand until it is done.

    Args:
        app: the ``QApplication`` to pump; ``Render`` records completion in
            ``app.ldone``.
        largs: ``[url, outfile]`` or ``[url, outfile, jsfile]``.

    Returns:
        ``-1`` when fewer than two arguments were given, otherwise the
        ``Render`` object (whether or not it finished within 119 polls,
        one second apart).
    """
    if len(largs) < 2:
        LOG.info("USAGE: ./phantom.py <url> <pdf-file> [<javascript-file>]")
        return -1

    url, outfile = largs[0], largs[1]
    jsfile = None
    if len(largs) > 2:
        jsfile = largs[2]
    # Number of finished pages to wait for.
    expected = 1

    page = Render(app, do_print=False, do_save=True)
    page.run(url, outfile, jsfile)

    attempt = 1
    while attempt < 120:
        app.processEvents()
        print(f"{app.ldone} {attempt}")
        if len(app.ldone) == expected:
            print(f"{app.ldone} found {expected}")
            app.exit()
            break
        time.sleep(1)
        attempt += 1
    return page
|
||||||
|
|
132
qasync_phantompy.py
Normal file
132
qasync_phantompy.py
Normal file
@ -0,0 +1,132 @@
|
|||||||
|
#!/usr/local/bin/python3.sh
|
||||||
|
# -*-mode: python; indent-tabs-mode: nil; py-indent-offset: 4; coding: utf-8 -*-
|
||||||
|
|
||||||
|
import sys
|
||||||
|
import os
|
||||||
|
import qasync
|
||||||
|
import asyncio
|
||||||
|
import time
|
||||||
|
import random
|
||||||
|
|
||||||
|
from PyQt5 import QtWidgets
|
||||||
|
from PyQt5.QtWidgets import (QProgressBar, QWidget, QVBoxLayout)
|
||||||
|
|
||||||
|
from phantompy import Render
|
||||||
|
from lookupdns import LookFor as Render
|
||||||
|
|
||||||
|
import logging
import warnings

# Suppress every warning category for the whole process.
warnings.filterwarnings('ignore')

# Module-wide logger; this is the root logger, so its level and handlers are
# whatever the embedding program configures.
LOG = logging.getLogger()
|
||||||
|
|
||||||
|
class Widget(QtWidgets.QWidget):
    """Small status window: a task-count label next to a progress bar.

    NOTE(review): ``update(text)`` shadows ``QWidget.update()`` for Python
    callers; the name is kept because existing callers use it.
    """

    def __init__(self):
        QtWidgets.QWidget.__init__(self)
        self._label = QtWidgets.QLabel()
        box = QtWidgets.QHBoxLayout()
        self.setLayout(box)
        box.addWidget(self._label)
        self.progress = QProgressBar()
        self.progress.setRange(0, 99)
        box.addWidget(self.progress)

    def update(self, text):
        """Show the number of asyncio tasks and set the progress bar.

        Args:
            text: progress value, anything ``int()`` accepts.  Values outside
                the bar's range are clamped to it.
        """
        try:
            i = len(asyncio.all_tasks())
        except RuntimeError:
            # all_tasks() needs a *running* event loop; this method is also
            # called while events are pumped by hand, before the loop runs.
            i = 0
        self._label.setText(str(i))
        # QProgressBar silently ignores out-of-range values, which would
        # leave the bar stuck below "full"; clamp instead.
        value = int(text)
        value = max(self.progress.minimum(), min(value, self.progress.maximum()))
        self.progress.setValue(value)
|
||||||
|
|
||||||
|
class ContextManager:
    """Async context manager that counts elapsed one-second ticks."""

    def __init__(self) -> None:
        # Number of completed tick() calls.
        self._seconds = 0

    async def __aenter__(self):
        """Log the entry and return this object as the ``as`` target."""
        LOG.debug("ContextManager enter")
        return self

    async def __aexit__(self, *args):
        """Log the exit; returns ``None``, so exceptions are not suppressed."""
        LOG.debug("ContextManager exit")

    async def tick(self):
        """Sleep for one second, then return the updated tick count."""
        await asyncio.sleep(1)
        elapsed = self._seconds + 1
        self._seconds = elapsed
        return elapsed
|
||||||
|
|
||||||
|
async def main(widget, app, ilen):
    """Poll once per second until ``ilen`` results are in, then quit the app.

    Args:
        widget: optional status widget with an ``update(text)`` method, or
            a false value when running headless.
        app: the ``QApplication``; finished results are read from
            ``app.lfps``.
        ilen: number of results to wait for.

    The loop gives up after 119 ticks.  When every result has arrived the
    collected ``app.lfps`` entries are printed, one per line, and the
    application is asked to exit.  Cancellation is swallowed so that the
    caller can cancel the task during shutdown without an error.
    """
    LOG.debug("Task started")
    try:
        async with ContextManager() as ctx:
            for _ in range(1, 120):
                seconds = await ctx.tick()
                LOG.info(str(seconds))
                ndone = len(app.lfps)
                # Queueing the pages accounts for the first half of the bar
                # (0..50); finished pages fill the second half, so scale by
                # 50, not 100.  ``ilen == 0`` means there is nothing to wait
                # for (and must not divide by zero).
                perc = 50 + int(ndone * 50.0 / ilen) if ilen else 100
                if widget:
                    widget.update(str(perc))
                LOG.debug(f"{app.lfps} {perc} {seconds}")
                if ndone >= ilen:
                    print('\n'.join(app.lfps))
                    app.exit()
                    # raise asyncio.CancelledError
                    break
            else:
                LOG.warning(f"timed out with {len(app.lfps)} of {ilen} done")
                if not widget:
                    # Headless: there is no window the user could close, so
                    # nothing else would ever stop the event loop.
                    app.exit()
    except asyncio.CancelledError:
        LOG.debug("Task cancelled")
|
||||||
|
|
||||||
|
def iMain(largs, bgui=True):
    """Queue one or more URLs and run the combined asyncio/Qt event loop.

    Args:
        largs: ``[url, outfile]`` or ``[url, outfile, jsfile]``.  When
            ``url`` names an existing file it is read as a list of URLs, one
            per line, of which four are picked at random.
        bgui: show the progress ``Widget`` while running.

    Returns:
        ``0`` after the event loop finished, ``-1`` when the arguments are
        unusable.

    NOTE(review): ``Render`` is whatever the module-level imports bind last;
    it is expected to accept ``(app, do_print=, do_save=)`` and to provide
    ``run(uri, outfile, jsfile)`` and ``percent``.
    """
    # Use the arguments that were passed in (they used to be overwritten
    # with sys.argv[1:]) and validate them before any Qt object exists.
    if len(largs) < 2:
        LOG.info("USAGE: ./qasync_phantompy.py <url-or-file> <out-file> [<javascript-file>]")
        return -1
    url = largs[0]
    outfile = largs[1]
    jsfile = largs[2] if len(largs) > 2 else None

    if os.path.exists(url):
        with open(url, 'rt') as ofd:
            # Skip blank lines: they would be queued as empty URLs.
            elts = [line.strip() for line in ofd if line.strip()]
        random.shuffle(elts)
        lelts = elts[:4]
    else:
        lelts = [url]
    if not lelts:
        LOG.info(f"no urls found in {url}")
        return -1

    app = QtWidgets.QApplication([])
    app.lstart = []
    if bgui:
        widget = Widget()
        widget._app = app
        widget.show()
    else:
        widget = None

    loop = qasync.QEventLoop(app)
    asyncio.set_event_loop(loop)

    # Hold a reference to every page: rebinding a single name would let
    # earlier pages be garbage-collected while they are still loading.
    pages = []
    for i, elt in enumerate(lelts):
        # run only starts the url loading
        r = Render(app, do_print=False, do_save=True)
        pages.append(r)
        uri = elt.strip()
        r.run(uri, outfile, jsfile)
        # Queueing accounts for the first half (0..50) of the progress bar.
        per = int(float(i) * 100.0 / 2 / len(lelts))
        LOG.debug(f"{r.percent} {app.lstart} {per} {i}")
        if len(lelts) == 1:
            break
        for _ in range(1, random.randint(30, 120)):
            # google throttles too many links at a time
            if widget:
                widget.update(str(per))
            app.processEvents()
            time.sleep(1)
    LOG.info(f"queued {len(app.lstart)} urls")

    # run until app.exec() is finished (Qt window is closed)
    task = loop.create_task(main(widget, app, len(lelts)))
    loop.run_forever()

    # cancel remaining tasks and wait for them to complete
    task.cancel()
    # No loop is running at this point, so the loop has to be named
    # explicitly: all_tasks() without an argument requires a running loop.
    tasks = asyncio.all_tasks(loop)
    if tasks:
        # return_exceptions: a cancelled task must not abort the shutdown.
        loop.run_until_complete(asyncio.gather(*tasks, return_exceptions=True))
    return 0
|
||||||
|
|
||||||
|
if __name__ == '__main__':
    # DEBUG=<positive integer> in the environment selects debug logging
    # (level 10); anything else, including a non-numeric value, selects
    # info (level 20).
    try:
        d = int(os.environ.get('DEBUG', 0))
    except ValueError:
        d = 0
    log_level = 10 if d > 0 else 20

    try:
        # Optional project helper; configure logging exactly once.
        from exclude_badExits import vsetup_logging
        vsetup_logging(log_level, stream=sys.stderr)
    except Exception:
        # Best effort, as before, but no longer silent: fall back to the
        # standard library so that log output is still produced.
        logging.basicConfig(level=log_level, stream=sys.stderr)
        LOG.debug("vsetup_logging unavailable, using logging.basicConfig", exc_info=True)

    sys.exit(iMain(sys.argv[1:], bgui=False))
|
||||||
|
|
Loading…
Reference in New Issue
Block a user