release version 0.4.9

Readme.md (49 changes)
@@ -1,6 +1,6 @@
 # pdfgrab
 
-* Version 0.4.8-Pre
+* Version 0.4.9
 
 ## What is it?
 
@@ -9,21 +9,18 @@ Basically it analyses PDF files for Metadata. You can direct it to a file or dir
 You can show it the url of a pdf or use the integrated googlesearch (thanx to mario vilas class)
 to search for pdfs at target site, download and analyse them.
 
-## What is new in 0.4.8 bug fix pre-release?
+## What is new in 0.4.9?
 
-* catching google error at too many requests
-* catching dns resolve urlopen error at googlelib
-* fixing annoying bug in regard of pdfs behind urls like http://host/pdf/
-* fixing zero size pdf error(online linked pdfs which are not accessable)
-* added some logging
-
-## What is new in 0.4.7 release?
-
-* Added support for html output file, this will be placed in the outdir path and is more clear then a text or json file
-* Added basic logging support, logfile is placed in pdfgrab.py directory
-* Reordered Codebase, exported functionality to some libraries
-* PDF XMP Metadata is grabbed now as well, but not yet saved in output files
-* added docs/ section with Changelog and Todo
+* exported reporting methods to libreport.py
+* added optargs for disabling the different report methods
+* made the html report a bit more shiny
+* added a function for generating the html report after analysis
+* exported requests and data storage to a new library
+* code fixes and clearer error handling
+* removed the mandatory site: parameter at the search flag -s
+* updated readme
+* -s flag now accepts several domains
+* cleaner console logging
 
 ## What information can be gathered?
 
@@ -132,7 +129,7 @@ Will analyse all pdf's in that directory
 
 ### Google Search Mode
 ```
-# ./pdfgrab.py -s site:kernel.org
+# ./pdfgrab.py -s kernel.org
 ```
 Result:
 ```
@@ -164,6 +161,26 @@ File: pdfgrab/bpf_global_data_and_static_keys.pdf
 /PTEX.Fullbanner This is pdfTeX, Version 3.14159265-2.6-1.40.17 (TeX Live 2016) kpathsea version 6.2.2
 ```
 
+### Google Search Mode, several domains
+```
+# ./pdfgrab.py -s example.com,example.us
+```
+
+### Reporting
+
+pdfgrab outputs the information in several formats. Unless disabled by one of the reporting flags (see -h), you will
+find the following in the output directory:
+
+* html report
+* text report
+* text url list
+* json data
+* json url list
+
+### Logging
+
+pdfgrab creates a logfile called "pdfgrab.log" in the running directory.
+
 ## Google
 
 * Search: filetype:pdf site:com
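
The -s flag now takes a comma-separated list of domains instead of a site: expression. As a rough illustration (not part of the commit), the value is split on commas and the domain restriction is handed to the search library separately from the filetype:pdf query, mirroring libs/libgoogle.py further down:

```
# Sketch only: how a comma-separated -s value maps onto the query used by libgoogle.
search = "example.com,example.us"   # hypothetical value passed via -s
domains = search.split(',')         # ['example.com', 'example.us']
query = 'filetype:pdf'              # base query; the domains list is passed separately
print(domains, query)
```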

@@ -1,6 +1,20 @@
 Changelog
 =========
 
+Version 4.9
+-----------
+
+* exported reporting methods to libreport.py
+* added optargs for disabling the different report methods
+* made the html report a bit more shiny
+* added a function for generating the html report after analysis
+* exported requests and data storage to a new library
+* code fixes and clearer error handling
+* removed the mandatory site: parameter at the search flag -s
+* updated readme
+* -s flag now accepts several domains
+* cleaner console logging
+
 Version 4.8 Bugfix-PreRelease
 -----------------------------
 

libs/libgoogle.py
@@ -5,19 +5,44 @@ from libs.libhelper import *
 def get_random_agent():
     return (gs.get_random_user_agent())
 
-def search_pdf(search, args):
+def hits_google(search, args):
+    ''' the function where googlesearch from mario vilas
+    is called
+    '''
+    s = search.split(',')
+    query = 'filetype:pdf'
+
+    try:
+        hits = gs.hits(query, domains=s, user_agent=gs.get_random_user_agent())
+
+    except urllib.error.HTTPError as e:
+        return False, e
+
+    except urllib.error.URLError as e:
+        return False, e
+
+    except IndexError as e:
+        return False, e
+
+    return True, hits
+
+
+def search_google(search, args):
     ''' the function where googlesearch from mario vilas
     is called
     '''
 
+    s = search.split(',')
     search_stop = args.search_stop
 
-    query = '%s filetype:pdf' % search
+    query = 'filetype:pdf'
+    #query = 'site:%s filetype:pdf' % search
     # print(query)
     urls = []
 
     try:
-        for url in gs.search(query, num=20, stop=search_stop, user_agent=gs.get_random_user_agent()):
+        for url in gs.search(query, num=20, domains=s, stop=search_stop, user_agent=gs.get_random_user_agent()):
             #print(url)
             urls.append(url)
 
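
Both hits_google and search_google hand back a (success, value) pair, where value is either the result or the caught exception. A minimal caller sketch, assuming the repository layout above (the 429 branch mirrors what pdfgrab.py's seek_and_analyse does further down):

```
# Sketch of a caller for the (success, value) convention used in libs/libgoogle.py.
from libs.libgoogle import search_google

def collect_urls(search, args):
    code, values = search_google(search, args)
    if not code:
        # values is the caught exception here, e.g. urllib.error.HTTPError
        if getattr(values, 'code', None) == 429:
            print('[-] Too many requests, back off or switch IP')
        return []
    return values
```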

libs/liblog.py
@@ -8,10 +8,12 @@ file_handler = logging.FileHandler('pdfgrab.log')
 console_handler = logging.StreamHandler()
 console_handler.setLevel(logging.WARNING)
 
-formatter = logging.Formatter('%(asctime)s:%(name)s:%(levelname)s:%(message)s')
+file_formatter = logging.Formatter('%(asctime)s:%(name)s:%(levelname)s:%(message)s')
+console_formatter = logging.Formatter('%(levelname)s:%(message)s')
 
-file_handler.setFormatter(formatter)
-console_handler.setFormatter(formatter)
+file_handler.setFormatter(file_formatter)
+console_handler.setFormatter(console_formatter)
 
 logger.addHandler(file_handler)
 logger.addHandler(console_handler)
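
The change gives the file handler and the console handler separate formatters, so the logfile stays verbose while the console only shows level and message. A self-contained sketch of the same standard-library pattern, independent of pdfgrab's liblog:

```
import logging

logger = logging.getLogger('demo')
logger.setLevel(logging.DEBUG)

file_handler = logging.FileHandler('demo.log')
console_handler = logging.StreamHandler()
console_handler.setLevel(logging.WARNING)   # console stays quiet below WARNING

# detailed format for the logfile, short format for the console
file_handler.setFormatter(logging.Formatter('%(asctime)s:%(name)s:%(levelname)s:%(message)s'))
console_handler.setFormatter(logging.Formatter('%(levelname)s:%(message)s'))

logger.addHandler(file_handler)
logger.addHandler(console_handler)

logger.info('written to the file only')
logger.warning('written to both the file and the console')
```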

libs/libreport.py (new file, 173 lines)
```
import os
import sys
import json
from json2html import *
from libs.pdf_png import get_png_base64


def prepare_analysis_dict(ana_queue):
    '''params: ana_queue - queue with collected information
    '''
    # initiate analysis dictionary
    analysis_dict = {}

    # move analysis dictionary in queue back to dictionary
    while ana_queue.empty() == False:
        item = ana_queue.get()
        # print('item ', item)
        analysis_dict.update(item)

    # ana_queue is empty now, return the newly created dictionary
    return analysis_dict


def create_txt_report(analysis_dict, outdir, out_filename):
    ''' create a txt report in the output directory
    '''
    # draw separator lines
    sep = '-' * 80 + '\n'

    # create output filepath
    txtout = "%s/%s.txt" % (outdir, out_filename)

    # open the file and return filedescriptor
    fwtxt = open(txtout, 'w')

    # get the keys of the dict
    for k in analysis_dict.keys():
        # write separator
        fwtxt.write(sep)

        # build entry filename of the pdf
        fname = 'File: %s\n' % (analysis_dict[k]['filename'])

        # build data entry
        ddata = analysis_dict[k]['data']

        # write the filename
        fwtxt.write(fname)

        # write the metadata
        for kdata in ddata.keys():
            metatxt = '%s:%s\n' % (kdata, ddata[kdata])
            fwtxt.write(metatxt)

        # write separator
        fwtxt.write(sep)

    # close the file
    fwtxt.close()

    return True


def create_json_report(analysis_dict, outdir, out_filename):
    ''' create a json report in the output directory
    '''
    # build json output name
    jsonout = "%s/%s.json" % (outdir, out_filename)

    # open up json output file
    fwjson = open(jsonout, 'w')

    # convert dictionary to json data
    jdata = json.dumps(analysis_dict)

    # write json data to file
    fwjson.write(jdata)

    # close file
    fwjson.close()

    return True


def create_html_report(analysis_dict, outdir, out_filename):
    ''' create a html report from the json data using json2html in the output directory
    '''
    # build up path for html output file
    htmlout = "%s/%s.html" % (outdir, out_filename)

    # open htmlout filedescriptor
    fwhtml = open(htmlout, 'w')

    # some html stuff
    pdfpng = get_png_base64('supply/pdf_base64.png')
    html_style = '<style>.center { display: block; margin-left: auto;margin-right: auto;} table {border-collapse: collapse;} th, td { border: 1px solid black;text-align: left; }</style>\n'
    html_head = '<html><head><title>pdfgrab - {0} item/s</title>{1}</head>\n'.format(len(analysis_dict), html_style)
    html_pdf_png = '<p class="center"><img class="center" src="data:image/jpeg;base64,{0}"><br><center>pdfgrab - grab and analyse pdf files</center><br></p>'.format(pdfpng)
    html_body = '<body>{0}\n'.format(html_pdf_png)
    html_end = '\n<br><br><p align="center"><a href="https://github.com/c0decave/pdfgrab">pdfgrab</a> by <a href="https://twitter.com/User_to_Root">dash</a></p></body></html>\n'

    # some attributes
    attr = 'id="meta-data" class="table table-bordered table-hover", border=1, cellpadding=3 summary="Metadata"'

    # convert dictionary to json data
    # in this mode each finding gets its own table, there are other possibilities
    # but for now i go with this
    html_out = ''
    for k in analysis_dict.keys():
        trans = analysis_dict[k]
        jdata = json.dumps(trans)
        html = json2html.convert(json = jdata, table_attributes=attr)
        html_out = html_out + html + "\n"
        #html_out = html_out + "<p>" + html + "</p>\n"
    #jdata = json.dumps(analysis_dict)

    # create html
    #html = json2html.convert(json = jdata, table_attributes=attr)

    # write html
    fwhtml.write(html_head)
    fwhtml.write(html_body)
    fwhtml.write(html_out)
    fwhtml.write(html_end)

    # close html file
    fwhtml.close()


def create_url_json(url_d, outdir, out_filename):
    ''' create a json url file in the output directory
    '''
    # create url savefile
    jsonurlout = "%s/%s_url.json" % (outdir, out_filename)

    # open up file for writing urls down
    fwjson = open(jsonurlout, 'w')

    # convert url dictionary to json
    jdata = json.dumps(url_d)

    # write json data to file
    fwjson.write(jdata)

    # close filedescriptor
    fwjson.close()

    return True


def create_url_txt(url_d, outdir, out_filename):
    ''' create a txt url file in the output directory
    '''
    # build up txt out path
    txtout = "%s/%s_url.txt" % (outdir, out_filename)

    # open up our url txtfile
    fwtxt = open(txtout, 'w')

    # iterate through the keys of the url dictionary
    for k in url_d.keys():

        # get the entry
        ddata = url_d[k]

        # create meta data for saving
        metatxt = '%s:%s\n' % (ddata['url'], ddata['filename'])

        # write metadata to file
        fwtxt.write(metatxt)

    # close fd
    fwtxt.close()

    return True
```
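
All report helpers consume the same analysis dictionary; judging from create_txt_report, each entry holds a 'filename' and a 'data' mapping of metadata keys. A hedged usage sketch under that assumption (the sample entry, metadata values and output names are made up):

```
# Hypothetical use of the new report helpers; the analysis_dict layout is
# inferred from create_txt_report and may not cover every field pdfgrab stores.
import os
from libs.libreport import create_txt_report, create_json_report

outdir, out_filename = 'pdfgrab', 'pdfgrab_analysis'   # assumed output names
os.makedirs(outdir, exist_ok=True)

analysis_dict = {
    'doc1': {
        'filename': 'pdfgrab/example.pdf',                    # made-up entry
        'data': {'/Author': 'alice', '/Producer': 'pdfTeX'},  # made-up metadata
    },
}

create_txt_report(analysis_dict, outdir, out_filename)
create_json_report(analysis_dict, outdir, out_filename)
```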

libs/librequest.py (new file, 162 lines)
```
import os
import sys
import json
import socket
import requests

from libs.liblog import logger
from libs.libhelper import *
from libs.libgoogle import get_random_agent


def store_file(url, data, outdir):
    ''' storing the downloaded data to a file
    params: url - is used to create the filename
            data - the data of the file
            outdir - the directory to store in
    returns: dict { "code":<code>, "data":<savepath>, "error":<error> }
    '''
    logger.info('Store file {0}'.format(url))
    name = find_name(url)

    # only allow the stored file a name with 50 chars
    if len(name) > 50:
        name = name[:49]

    # build up the save path
    save = "%s/%s" % (outdir, name)

    try:
        f = open(save, "wb")

    except OSError as e:
        logger.warning('store_file {0}'.format(e))
        # return ret_dict
        return {"code": False, "data": save, "error": e}

    # write the data and return the written bytes
    ret = f.write(data)

    # check if the written bytes are zero
    if ret == 0:
        logger.warning('Written {0} bytes for file: {1}'.format(ret, save))
    else:
        # log to info that the bytes and file have been written
        logger.info('Written {0} bytes for file: {1}'.format(ret, save))

    # close file descriptor
    f.close()

    # return ret_dict
    return {"code": True, "data": save, "error": False}


def download_file(url, args, header_data):
    ''' downloading the file for later analysis
    params: url - the url
            args - argparse args namespace
            header_data - pre-defined header data
    returns: ret_dict
    '''
    # check the remote tls certificate or not?
    cert_check = args.cert_check

    # run our try/except routine
    try:
        # request the url and save the response in req
        # give header data and set verify as delivered by args.cert_check
        req = requests.get(url, headers=header_data, verify=cert_check)

    except requests.exceptions.SSLError as e:
        logger.warning('download file {0} {1}'.format(url, e))
        # return ret_dict
        return {"code": False, "data": False, "error": e}

    except requests.exceptions.InvalidSchema as e:
        logger.warning('download file {0} {1}'.format(url, e))
        # return ret_dict
        return {"code": False, "data": False, "error": e}

    except socket.gaierror as e:
        logger.warning('download file, host not known {0} {1}'.format(url, e))
        return {"code": False, "data": False, "error": e}

    except:
        logger.warning('download file, something wrong with remote server? {0}'.format(url))
        # return ret_dict
        if 'req' not in locals():
            req = False

        return {"code": False, "data": req, "error": True}

    #finally:
        # lets close the socket
        #req.close()

    # return ret_dict
    return {"code": True, "data": req, "error": False}


def grab_run(url, args, outdir):
    ''' function keeping together all the steps for the user call of grabbing
    just one file and analysing it
    '''
    header_data = {'User-Agent': get_random_agent()}
    rd_download = download_file(url, args, header_data)
    code_down = rd_download['code']

    # if code is True, the download of the file was successful
    if code_down:
        rd_evaluate = evaluate_response(rd_download)
        code_eval = rd_evaluate['code']
        # if code is True, evaluation was also successful
        if code_eval:
            # get the content from the evaluated request
            content = rd_evaluate['data'].content

            # call store file
            rd_store = store_file(url, content, outdir)

            # get the code
            code_store = rd_store['code']

            # get the savepath
            savepath = rd_store['data']

            # if code is True, storing the file was also successful
            if code_store:
                return {"code": True, "data": savepath, "error": False}

    return {"code": False, "data": False, "error": True}


def evalute_content(ret_dict):
    pass


def evaluate_response(ret_dict):
    ''' this method usually comes after download_file,
    it will evaluate what has happened and whether we even have some data
    to process or not
    params: ret_dict - the dict holding the req object from the conducted request
    returns: dict { "code":<code>, "data":<req>, "error":<error> }
    '''
    # extract data from ret_dict
    req = ret_dict['data']

    # get url, status code and reason
    url = req.url
    status = req.status_code
    reason = req.reason

    # ahh, everything is fine
    if status == 200:
        logger.info('download file, {0} {1} {2}'.format(url, reason, status))
        return {"code": True, "data": req, "error": False}

    # nah, something is not like it should be
    else:
        logger.warning('download file, {0} {1} {2}'.format(url, reason, status))
        return {"code": False, "data": req, "error": True}
```
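
Every function in librequest.py returns the same {"code", "data", "error"} dictionary, so callers only branch on "code". A sketch of driving grab_run directly, with a throwaway namespace standing in for the parsed arguments (the URL is a placeholder, and only the cert_check attribute that librequest reads is provided):

```
# Sketch only: exercising grab_run outside pdfgrab.py.
import os
from argparse import Namespace
from libs.librequest import grab_run

args = Namespace(cert_check=True)     # minimal stand-in for pdfgrab's parsed args
outdir = 'pdfgrab'
os.makedirs(outdir, exist_ok=True)

result = grab_run('https://example.com/some.pdf', args, outdir)
if result['code']:
    print('saved to', result['data'])
else:
    print('download failed:', result['error'])
```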

libs/pdf_png.py (new file, 5 lines)
```
def get_png_base64(filename):
    fr = open(filename, 'r')
    buf = fr.read()
    return buf
```
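
get_png_base64 only reads the file back, so supply/pdf_base64.png presumably already contains the base64 text of supply/pdf.png. A sketch of how such a file could be produced (an assumption, not something this commit shows):

```
# Assumption: supply/pdf_base64.png holds the base64-encoded bytes of supply/pdf.png.
import base64

with open('supply/pdf.png', 'rb') as fr:
    encoded = base64.b64encode(fr.read()).decode('ascii')

with open('supply/pdf_base64.png', 'w') as fw:
    fw.write(encoded)
```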

pdfgrab.py (195 changes)
@@ -22,12 +22,14 @@ from PyPDF2 import pdf
 from libs.liblog import logger
 from libs.libhelper import *
 from libs.libgoogle import *
+from libs.libreport import *
+from libs.librequest import grab_run
 
 from IPython import embed
 
 # some variables in regard of the tool itself
 name = 'pdfgrab'
-version = '0.4.8-Pre'
+version = '0.4.9'
 author = 'dash'
 date = 'November 2019'
 
@@ -243,72 +245,6 @@ def check_encryption(filename):
 
     return True
 
 
-def download_pdf(url, args, header_data):
-    ''' downloading the pdfile for later analysis '''
-
-    # check the remote tls certificate or not?
-    cert_check = args.cert_check
-
-    try:
-        req = requests.get(url, headers=header_data, verify=cert_check)
-        # req = requests.get(url,headers=header_data,verify=False)
-        data = req.content
-        status_code = req.status_code
-
-    except requests.exceptions.SSLError as e:
-        logger.warning('download pdf {0}{1}'.format(url,e))
-        return -1
-
-    except:
-        logger.warning('download pdf, something wrong with remote server? {0}'.format(url))
-        return -1
-
-    if status_code == 403:
-        logger.warning('download pdf, 403 Forbidden {0}'.format(url))
-        return -1
-
-    # print(len(data))
-    return data
-
-
-def store_pdf(url, data, outdir):
-    ''' storing the downloaded pdf data
-    '''
-
-    logger.info('Store pdf {0}'.format(url))
-    name = find_name(url)
-    #logger.warning(url)
-    #logger.warning(name)
-    #logger.warning(outdir)
-
-    # only allow stored file a name with 50 chars
-    if len(name) > 50:
-        name = name[:49] + '.pdf'
-    # print(len(name))
-
-    save = "%s/%s" % (outdir, name)
-
-    try:
-        f = open(save, "wb")
-    except OSError as e:
-        logger.warning('store_pdf {0}'.format(e))
-        return -1
-
-    ret = f.write(data)
-    logger.info('Written {0} bytes for file: {1}'.format(ret,save))
-    f.close()
-
-    if ret == 0:
-        logger.warning('Written {0} bytes for file: {1}'.format(ret,save))
-        return save
-        #return -1
-
-
-    # return the savepath
-    return save
-
-
 def _parse_pdf(filename):
     ''' the real parsing function '''
 
@@ -320,26 +256,18 @@ def _parse_pdf(filename):
         logger.warning('Filesize is 0 bytes at file: {0}'.format(filename))
         return False
 
 
-def grab_url(url, args, outdir):
-    ''' function keeping all the steps for the user call of grabbing
-    just one pdf and analysing it
-    '''
-    header_data = {'User-Agent': get_random_agent()}
-    data = download_pdf(url, args, header_data)
-    if data != -1:
-        savepath = store_pdf(url, data, outdir)
-        _parse_pdf(savepath)
-
-    return
-
-
 def seek_and_analyse(search, args, outdir):
     ''' function for keeping all the steps of searching for pdfs and analysing
     them together
     '''
+    # check how many hits we got
+    # seems like the hits method is broken in the googlesearch library :(
+    #code, hits = hits_google(search, args)
+    #if code:
+    #    print('Got {0} hits'.format(hits))
+
     # use the search function of googlesearch to get the results
-    code, values = search_pdf(search, args)
+    code, values = search_google(search, args)
     if not code:
         if values.code == 429:
             logger.warning('[-] Too many requests, time to change ip address or use proxychains')

@@ -362,7 +290,11 @@ def seek_and_analyse(search, args, outdir):
         item = url_q.get()
         # print(item)
         url = item['url']
-        grab_url(url, args, outdir)
+        rd_grabrun = grab_run(url, args, outdir)
+        code = rd_grabrun['code']
+        savepath = rd_grabrun['data']
+        if code:
+            _parse_pdf(savepath)
 
     return True
 
@@ -372,6 +304,9 @@ def run(args):
     # initialize logger
     logger.info('{0} Started'.format(name))
 
+    # create some variables
+
+
     # outfile name
     if args.outfile:
         out_filename = args.outfile

@@ -381,6 +316,7 @@ def run(args):
     # specify output directory
     outdir = args.outdir
 
+
     # create output directory
     make_directory(outdir)
 
@@ -417,68 +353,43 @@ def run(args):
             fpath = '%s/%s' % (directory, f)
             _parse_pdf(fpath)
 
+    # simply generate the html report from the json outfile
+    elif args.gen_html_report:
+        fr = open(args.gen_html_report, 'r')
+        analysis_dict = json.loads(fr.read())
+        if create_html_report(analysis_dict, outdir, out_filename):
+            logger.info('Successfully created html report')
+            sys.exit(0)
+        else:
+            sys.exit(1)
+
     else:
         print('[-] Dunno what to do, bro. Use help. {0} -h'.format(sys.argv[0]))
+        sys.exit(1)
 
-    # move analysis dictionary in queue back to dictionary
-    analysis_dict = {}
-    while ana_q.empty() == False:
-        item = ana_q.get()
-        # print('item ', item)
-        analysis_dict.update(item)
-
-    #print('dict:',analysis_dict)
-    # ana_q is empty now
-
-    # create txt output
-    sep = '-' * 80 + '\n'
-    txtout = "%s/%s.txt" % (outdir, out_filename)
-    fwtxt = open(txtout, 'w')
-    # print(analysis_dict)
-    for k in analysis_dict.keys():
-        fwtxt.write(sep)
-        fname = 'File: %s\n' % (analysis_dict[k]['filename'])
-        ddata = analysis_dict[k]['data']
-        fwtxt.write(fname)
-        for kdata in ddata.keys():
-            metatxt = '%s:%s\n' % (kdata, ddata[kdata])
-            fwtxt.write(metatxt)
-        fwtxt.write(sep)
-    fwtxt.close()
-
-    # create json output
-    jsonout = "%s/%s.json" % (outdir, out_filename)
-    fwjson = open(jsonout, 'w')
-
-    # print(analysis_dict)
-    jdata = json.dumps(analysis_dict)
-    fwjson.write(jdata)
-    fwjson.close()
-
-    # create html from json
-    htmlout = "%s/%s.html" % (outdir, out_filename)
-    fwhtml = open(htmlout,'w')
-    #print(jdata)
-    html = json2html.convert(json = jdata)
-    fwhtml.write(html)
-    fwhtml.close()
-
-    # create url savefile
-    # print('url_d: ', url_d)
-    jsonurlout = "%s/%s_url.json" % (outdir, out_filename)
-    fwjson = open(jsonurlout, 'w')
-    jdata = json.dumps(url_d)
-    fwjson.write(jdata)
-    fwjson.close()
-
-    txtout = "%s/%s_url.txt" % (outdir, out_filename)
-    fwtxt = open(txtout, 'w')
-    for k in url_d.keys():
-        ddata = url_d[k]
-        metatxt = '%s:%s\n' % (ddata['url'], ddata['filename'])
-        fwtxt.write(metatxt)
-    fwtxt.close()
+    # creating the analysis dictionary for reporting
+    analysis_dict = prepare_analysis_dict(ana_q)
+
+    # lets go through the different reporting types
+    if args.report_txt:
+        if create_txt_report(analysis_dict, outdir, out_filename):
+            logger.info('Successfully created txt report')
+
+    if args.report_json:
+        if create_json_report(analysis_dict, outdir, out_filename):
+            logger.info('Successfully created json report')
+
+    if args.report_html:
+        if create_html_report(analysis_dict, outdir, out_filename):
+            logger.info('Successfully created html report')
+
+    if args.report_url_txt:
+        if create_url_txt(url_d, outdir, out_filename):
+            logger.info('Successfully created txt url report')
+
+    if args.report_url_json:
+        if create_url_json(url_d, outdir, out_filename):
+            logger.info('Successfully created json url report')
 
     return 42
 
@@ -504,8 +415,14 @@ def main():
                         help="specify domain or tld to scrape for pdf-files", default=None)
     parser.add_argument('-sn', '--search-number', action='store', dest='search_stop', required=False,
                         help="specify how many files are searched", default=10, type=int)
-    parser.add_argument('-z', '--disable-cert-check', action='store_false', dest='cert_check', required=False,
-                        help="if the target domain(s) run with old or bad certificates", default=True)
+    parser.add_argument('-z', '--disable-cert-check', action='store_false', dest='cert_check', required=False, help="if the target domain(s) run with old or bad certificates", default=True)
+    parser.add_argument('-ghr', '--gen-html-report', action='store', dest='gen_html_report', required=False, help="If you want to generate the html report after editing the json outfile (parameter: pdfgrab_analysis.json)")
+    parser.add_argument('-rtd', '--report-text-disable', action='store_false', dest='report_txt', required=False, help="Disable txt report", default=True)
+    parser.add_argument('-rjd', '--report-json-disable', action='store_false', dest='report_json', required=False, help="Disable json report", default=True)
+    parser.add_argument('-rhd', '--report-html-disable', action='store_false', dest='report_html', required=False, help="Disable html report", default=True)
+    parser.add_argument('-rutd', '--report-url-text-disable', action='store_false', dest='report_url_txt', required=False, help="Disable url txt report", default=True)
+    parser.add_argument('-rujd', '--report-url-json-disable', action='store_false', dest='report_url_json', required=False, help="Disable url json report", default=True)
 
     if len(sys.argv)<2:
         parser.print_help(sys.stderr)
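
All of the new report switches are store_false toggles defaulting to True, so every report format stays enabled unless explicitly disabled. A small standalone sketch of that pattern, with the parser trimmed to two of the flags from the diff:

```
# Standalone sketch of the store_false pattern used for the new report flags.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('-rtd', '--report-text-disable', action='store_false', dest='report_txt', default=True)
parser.add_argument('-rhd', '--report-html-disable', action='store_false', dest='report_html', default=True)

print(parser.parse_args([]))          # both reports stay enabled by default
print(parser.parse_args(['-rhd']))    # html report disabled, txt report still on
```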

supply/pdf.png (new binary file, 24 KiB; content not shown)
supply/pdf_base64.png (new file, 1 line; diff suppressed because the line is too long)