release version 0.4.9

Author: c0decave
Date:   2019-11-07 12:51:30 +01:00
parent a89ac93c3d
commit 132750867f
10 changed files with 477 additions and 161 deletions

View File

@@ -1,6 +1,6 @@
 # pdfgrab
-* Version 0.4.8-Pre
+* Version 0.4.9
 ## What is it?
@@ -9,21 +9,18 @@ Basically it analyses PDF files for Metadata. You can direct it to a file or dir
 You can show it the url of a pdf or use the integrated googlesearch (thanx to mario vilas class)
 to search for pdfs at target site, download and analyse them.
-## What is new in 0.4.8 bug fix pre-release?
-* catching google error at too many requests
-* catching dns resolve urlopen error at googlelib
-* fixing annoying bug in regard of pdfs behind urls like http://host/pdf/
-* fixing zero size pdf error(online linked pdfs which are not accessable)
-* added some logging
-## What is new in 0.4.7 release?
-* Added support for html output file, this will be placed in the outdir path and is more clear then a text or json file
-* Added basic logging support, logfile is placed in pdfgrab.py directory
-* Reordered Codebase, exported functionality to some libraries
-* PDF XMP Metadata is grabbed now as well, but not yet saved in output files
-* added docs/ section with Changelog and Todo
+## What is new in 0.4.9?
+* exported reporting methods to libreport.py
+* added optargs for disabling the different report methods
+* made the html report a bit more shiny
+* added a function for generating the html report after analysis
+* exported requests and data storage to a new library
+* code fixes and clearer error handling
+* removed the mandatory site: parameter from the -s search flag
+* updated readme
+* the -s flag now accepts several domains
+* cleaner console logging
 ## What information can be gathered?
@@ -132,7 +129,7 @@ Will analyse all pdf's in that directory
 ### Google Search Mode
 ```
-# ./pdfgrab.py -s site:kernel.org
+# ./pdfgrab.py -s kernel.org
 ```
 Result:
 ```
@@ -164,6 +161,26 @@ File: pdfgrab/bpf_global_data_and_static_keys.pdf
 /PTEX.Fullbanner This is pdfTeX, Version 3.14159265-2.6-1.40.17 (TeX Live 2016) kpathsea version 6.2.2
 ```
+### Google Search Mode, several domains
+```
+# ./pdfgrab.py -s example.com,example.us
+```
+### Reporting
+pdfgrab outputs the gathered information in several formats. Unless disabled by one of the reporting flags (see -h), you will
+find the following in the output directory:
+* html report
+* text report
+* text url list
+* json data
+* json url list
+### Logging
+pdfgrab creates a logfile called "pdfgrab.log" in the running directory.
 ## Google
 * Search: filetype:pdf site:com
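
As a purely illustrative combination of the new README options above, a run that searches two domains at once and skips the html report and the json url list could look like this (flag names taken from the argument parser changes further down in this commit; shown only as a hypothetical example):
```
# ./pdfgrab.py -s kernel.org,example.com -rhd -rujd
```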

View File

@@ -1,6 +1,20 @@
 Changelog
 =========
+Version 4.9
+-----------
+* exported reporting methods to libreport.py
+* added optargs for disabling the different report methods
+* made the html report a bit more shiny
+* added a function for generating the html report after analysis
+* exported requests and data storage to a new library
+* code fixes and clearer error handling
+* removed the mandatory site: parameter from the -s search flag
+* updated readme
+* the -s flag now accepts several domains
+* cleaner console logging
 Version 4.8 Bugfix-PreRelease
 -----------------------------

View File

@@ -5,19 +5,44 @@ from libs.libhelper import *
 def get_random_agent():
     return (gs.get_random_user_agent())

-def search_pdf(search, args):
+def hits_google(search, args):
+    ''' the function where googlesearch from mario vilas
+    is called
+    '''
+    s = search.split(',')
+    query = 'filetype:pdf'
+    try:
+        hits = gs.hits(query, domains=s, user_agent=gs.get_random_user_agent())
+    except urllib.error.HTTPError as e:
+        return False, e
+    except urllib.error.URLError as e:
+        return False, e
+    except IndexError as e:
+        return False, e
+    return True, hits
+
+def search_google(search, args):
     ''' the function where googlesearch from mario vilas
     is called
     '''
+    s = search.split(',')
     search_stop = args.search_stop
-    query = '%s filetype:pdf' % search
+    query = 'filetype:pdf'
+    #query = 'site:%s filetype:pdf' % search
     # print(query)
     urls = []
     try:
-        for url in gs.search(query, num=20, stop=search_stop, user_agent=gs.get_random_user_agent()):
+        for url in gs.search(query, num=20, domains=s, stop=search_stop, user_agent=gs.get_random_user_agent()):
             #print(url)
             urls.append(url)
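
As a quick orientation for this refactor, here is a hypothetical standalone call of search_google; the (status, result) tuple shape is inferred from its call site in seek_and_analyse further down, and SimpleNamespace merely stands in for the argparse namespace:

```python
# Hypothetical usage sketch; assumes the repo's package layout (run from the repo root).
from types import SimpleNamespace
from libs.libgoogle import search_google

args = SimpleNamespace(search_stop=10)      # only search_stop is read by search_google
ok, result = search_google('kernel.org,example.com', args)
if ok:
    for url in result:                      # on success: the list of pdf urls found
        print(url)
else:
    print('search failed:', result)         # on failure: the caught exception
```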

View File

@@ -8,10 +8,12 @@ file_handler = logging.FileHandler('pdfgrab.log')
 console_handler = logging.StreamHandler()
 console_handler.setLevel(logging.WARNING)

-formatter = logging.Formatter('%(asctime)s:%(name)s:%(levelname)s:%(message)s')
-file_handler.setFormatter(formatter)
-console_handler.setFormatter(formatter)
+file_formatter = logging.Formatter('%(asctime)s:%(name)s:%(levelname)s:%(message)s')
+console_formatter = logging.Formatter('%(levelname)s:%(message)s')
+
+file_handler.setFormatter(file_formatter)
+console_handler.setFormatter(console_formatter)

 logger.addHandler(file_handler)
 logger.addHandler(console_handler)
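
The practical effect of the two formatters is that modules importing the shared logger (as librequest.py and pdfgrab.py do) get terse console output for warnings and richer, timestamped lines in pdfgrab.log. A minimal sketch, assuming the logger's own level is set to INFO earlier in liblog.py (not shown in this hunk):

```python
from libs.liblog import logger

logger.info('written to pdfgrab.log only, with timestamp, logger name and level')
logger.warning('additionally echoed to the console in the short levelname:message format')
```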

libs/libreport.py (new file, 173 lines)

@@ -0,0 +1,173 @@
import os
import sys
import json
from json2html import *
from libs.pdf_png import get_png_base64

def prepare_analysis_dict(ana_queue):
    '''params: ana_queue - queue with collected information
    '''
    # initiate analysis dictionary
    analysis_dict = {}
    # move analysis dictionary in queue back to dictionary
    while ana_queue.empty() == False:
        item = ana_queue.get()
        # print('item ', item)
        analysis_dict.update(item)
    # ana_q is empty now, return the newly created dictionary
    return analysis_dict
def create_txt_report(analysis_dict, outdir, out_filename):
    ''' create a txt report in the output directory
    '''
    # draw separator lines
    sep = '-' * 80 + '\n'
    # create output filepath
    txtout = "%s/%s.txt" % (outdir, out_filename)
    # open the file and return filedescriptor
    fwtxt = open(txtout, 'w')
    # get the keys of the dict
    for k in analysis_dict.keys():
        # write separator
        fwtxt.write(sep)
        # build entry filename of the pdf
        fname = 'File: %s\n' % (analysis_dict[k]['filename'])
        # build data entry
        ddata = analysis_dict[k]['data']
        # write the filename
        fwtxt.write(fname)
        # write the metadata
        for kdata in ddata.keys():
            metatxt = '%s:%s\n' % (kdata, ddata[kdata])
            fwtxt.write(metatxt)
        # write separator
        fwtxt.write(sep)
    # close the file
    fwtxt.close()
    return True
def create_json_report(analysis_dict, outdir, out_filename):
    ''' create a jsonfile report in the output directory
    '''
    # build json output name
    jsonout = "%s/%s.json" % (outdir, out_filename)
    # open up json output file
    fwjson = open(jsonout, 'w')
    # convert dictionary to json data
    jdata = json.dumps(analysis_dict)
    # write json data to file
    fwjson.write(jdata)
    # close file
    fwjson.close()
    return True
def create_html_report(analysis_dict, outdir, out_filename):
    ''' create a html report from the json data using json2html in the output directory
    '''
    # build up path for html output file
    htmlout = "%s/%s.html" % (outdir, out_filename)
    # open htmlout filedescriptor
    fwhtml = open(htmlout, 'w')
    # some html stuff
    pdfpng = get_png_base64('supply/pdf_base64.png')
    html_style = '<style>.center { display: block; margin-left: auto;margin-right: auto;} table {border-collapse: collapse;} th, td { border: 1px solid black;text-align: left; }</style>\n'
    html_head = '<html><head><title>pdfgrab - {0} item/s</title>{1}</head>\n'.format(len(analysis_dict), html_style)
    html_pdf_png = '<p class="center"><img class="center" src="data:image/jpeg;base64,{0}"><br><center>pdfgrab - grab and analyse pdf files</center><br></p>'.format(pdfpng)
    html_body = '<body>{0}\n'.format(html_pdf_png)
    html_end = '\n<br><br><p align="center"><a href="https://github.com/c0decave/pdfgrab">pdfgrab</a> by <a href="https://twitter.com/User_to_Root">dash</a></p></body></html>\n'
    # some attributes
    attr = 'id="meta-data" class="table table-bordered table-hover", border=1, cellpadding=3 summary="Metadata"'
    # convert dictionary to json data
    # in this mode each finding gets its own table, there are other possibilities
    # but for now i go with this
    html_out = ''
    for k in analysis_dict.keys():
        trans = analysis_dict[k]
        jdata = json.dumps(trans)
        html = json2html.convert(json = jdata, table_attributes=attr)
        html_out = html_out + html + "\n"
        #html_out = html_out + "<p>" + html + "</p>\n"
    #jdata = json.dumps(analysis_dict)
    # create html
    #html = json2html.convert(json = jdata, table_attributes=attr)
    # write html
    fwhtml.write(html_head)
    fwhtml.write(html_body)
    fwhtml.write(html_out)
    fwhtml.write(html_end)
    # close html file
    fwhtml.close()
    return True
def create_url_json(url_d, outdir, out_filename):
    ''' create a json url file in output directory
    '''
    # create url savefile
    jsonurlout = "%s/%s_url.json" % (outdir, out_filename)
    # open up file for writing urls down
    fwjson = open(jsonurlout, 'w')
    # convert url dictionary to json
    jdata = json.dumps(url_d)
    # write json data to file
    fwjson.write(jdata)
    # close filedescriptor
    fwjson.close()
    return True

def create_url_txt(url_d, outdir, out_filename):
    ''' create a txt url file in output directory
    '''
    # build up txt out path
    txtout = "%s/%s_url.txt" % (outdir, out_filename)
    # open up our url txtfile
    fwtxt = open(txtout, 'w')
    # iterating through the keys of the url dictionary
    for k in url_d.keys():
        # get the entry
        ddata = url_d[k]
        # create meta data for saving
        metatxt = '%s:%s\n' % (ddata['url'], ddata['filename'])
        # write metadata to file
        fwtxt.write(metatxt)
    # close fd
    fwtxt.close()
    return True
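
The helpers above all operate on the same analysis dictionary: one entry per grabbed pdf carrying a 'filename' and a 'data' metadata dict, as can be read off create_txt_report. A hypothetical standalone sketch with made-up sample values (in pdfgrab itself the real dictionary is built from the analysis queue by prepare_analysis_dict):

```python
# Hypothetical example data; the keys and metadata names below are illustrative only.
from libs.libreport import create_txt_report, create_json_report

analysis_dict = {
    'example.pdf': {
        'filename': 'pdfgrab/example.pdf',
        'data': {'/Author': 'dash', '/Producer': 'pdfTeX'},
    },
}
# writes ./example_report.txt and ./example_report.json (the directory must already exist)
create_txt_report(analysis_dict, '.', 'example_report')
create_json_report(analysis_dict, '.', 'example_report')
```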

libs/librequest.py (new file, 162 lines)

@@ -0,0 +1,162 @@
import os
import sys
import json
import socket
import requests

from libs.liblog import logger
from libs.libhelper import *
from libs.libgoogle import get_random_agent

def store_file(url, data, outdir):
    ''' storing the downloaded data to a file
    params: url - is used to create the filename
            data - the data of the file
            outdir - to store in which directory
    returns: dict { "code":<code>, "data":<savepath>, "error":<error>} - the status code, the savepath, the errorcode
    '''
    logger.info('Store file {0}'.format(url))
    name = find_name(url)
    # only allow stored file a name with 50 chars
    if len(name) > 50:
        name = name[:49]
    # build up the save path
    save = "%s/%s" % (outdir, name)
    try:
        f = open(save, "wb")
    except OSError as e:
        logger.warning('store_file {0}'.format(e))
        # return ret_dict
        return {"code":False, "data":save, "error":e}
    # write the data and return the written bytes
    ret = f.write(data)
    # check if bytes are zero
    if ret == 0:
        logger.warning('Written {0} bytes for file: {1}'.format(ret, save))
    else:
        # log to info that bytes and file has been written
        logger.info('Written {0} bytes for file: {1}'.format(ret, save))
    # close file descriptor
    f.close()
    # return ret_dict
    return {"code":True, "data":save, "error":False}
def download_file(url, args, header_data):
    ''' downloading the file for later analysis
    params: url - the url
            args - argparse args namespace
            header_data - pre-defined header data
    returns: ret_dict
    '''
    # check the remote tls certificate or not?
    cert_check = args.cert_check
    # run our try catch routine
    try:
        # request the url and save the response in req
        # give header data and set verify as delivered by args.cert_check
        req = requests.get(url, headers=header_data, verify=cert_check)
    except requests.exceptions.SSLError as e:
        logger.warning('download file {0}{1}'.format(url, e))
        # no response object exists if requests.get() itself raised
        return {"code":False, "data":False, "error":e}
    except requests.exceptions.InvalidSchema as e:
        logger.warning('download file {0}{1}'.format(url, e))
        # return retdict
        return {"code":False, "data":False, "error":e}
    except socket.gaierror as e:
        logger.warning('download file, host not known {0} {1}'.format(url, e))
        return {"code":False, "data":False, "error":e}
    except:
        logger.warning('download file, something wrong with remote server? {0}'.format(url))
        # req does not exist if requests.get() itself raised
        if 'req' not in locals():
            req = False
        return {"code":False, "data":req, "error":True}
    #finally:
    #    # lets close the socket
    #    #req.close()
    # return retdict
    return {"code":True, "data":req, "error":False}
def grab_run(url, args, outdir):
    ''' function keeping all the steps for the user call of grabbing
    just one and analysing it
    '''
    header_data = {'User-Agent': get_random_agent()}
    rd_download = download_file(url, args, header_data)
    code_down = rd_download['code']
    # if code is True, download of file was successfull
    if code_down:
        rd_evaluate = evaluate_response(rd_download)
        code_eval = rd_evaluate['code']
        # if code is True, evaluation was also successful
        if code_eval:
            # get the content from the evaluate dictionary request
            content = rd_evaluate['data'].content
            # call store file
            rd_store = store_file(url, content, outdir)
            # get the code
            code_store = rd_store['code']
            # get the savepath
            savepath = rd_store['data']
            # if code is True, storing of file was also successfull
            if code_store:
                return {"code":True, "data":savepath, "error":False}
    return {"code":False, "data":False, "error":True}

def evalute_content(ret_dict):
    pass

def evaluate_response(ret_dict):
    ''' this method comes usually after download_file,
    it will evaluate what has happened and if we even have some data to process
    or not
    params: data - is the req object from the conducted request
    returns: dict { "code":<code>, "data":<savepath>, "error":<error>} - the status code, the savepath, the errorcode
    '''
    # extract data from ret_dict
    req = ret_dict['data']
    # get status code
    url = req.url
    status = req.status_code
    reason = req.reason
    # ahh everything is fine
    if status == 200:
        logger.info('download file, {0} {1} {2}'.format(url, reason, status))
        return {"code":True, "data":req, "error":False}
    # nah something is not like it should be
    else:
        logger.warning('download file, {0} {1} {2}'.format(url, reason, status))
        return {"code":False, "data":req, "error":True}

libs/pdf_png.py (new file, 5 lines)

@@ -0,0 +1,5 @@
def get_png_base64(filename):
    # the file (supply/pdf_base64.png) already holds the base64-encoded image data,
    # so it is read as text and returned unchanged for embedding in the html report
    fr = open(filename, 'r')
    buf = fr.read()
    return buf

View File

@@ -22,12 +22,14 @@ from PyPDF2 import pdf
 from libs.liblog import logger
 from libs.libhelper import *
 from libs.libgoogle import *
+from libs.libreport import *
+from libs.librequest import grab_run
 from IPython import embed

 # some variables in regard of the tool itself
 name = 'pdfgrab'
-version = '0.4.8-Pre'
+version = '0.4.9'
 author = 'dash'
 date = 'November 2019'
@@ -243,72 +245,6 @@ def check_encryption(filename):
     return True

-def download_pdf(url, args, header_data):
-    ''' downloading the pdfile for later analysis '''
-    # check the remote tls certificate or not?
-    cert_check = args.cert_check
-    try:
-        req = requests.get(url, headers=header_data, verify=cert_check)
-        # req = requests.get(url,headers=header_data,verify=False)
-        data = req.content
-        status_code = req.status_code
-    except requests.exceptions.SSLError as e:
-        logger.warning('download pdf {0}{1}'.format(url,e))
-        return -1
-    except:
-        logger.warning('download pdf, something wrong with remote server? {0}'.format(url))
-        return -1
-    if status_code == 403:
-        logger.warning('download pdf, 403 Forbidden {0}'.format(url))
-        return -1
-    # print(len(data))
-    return data
-
-def store_pdf(url, data, outdir):
-    ''' storing the downloaded pdf data
-    '''
-    logger.info('Store pdf {0}'.format(url))
-    name = find_name(url)
-    #logger.warning(url)
-    #logger.warning(name)
-    #logger.warning(outdir)
-    # only allow stored file a name with 50 chars
-    if len(name) > 50:
-        name = name[:49] + '.pdf'
-    # print(len(name))
-    save = "%s/%s" % (outdir, name)
-    try:
-        f = open(save, "wb")
-    except OSError as e:
-        logger.warning('store_pdf {0}'.format(e))
-        return -1
-    ret = f.write(data)
-    logger.info('Written {0} bytes for file: {1}'.format(ret,save))
-    f.close()
-    if ret == 0:
-        logger.warning('Written {0} bytes for file: {1}'.format(ret,save))
-        return save
-        #return -1
-    # return the savepath
-    return save
-
 def _parse_pdf(filename):
     ''' the real parsing function '''
@@ -320,26 +256,18 @@ def _parse_pdf(filename):
         logger.warning('Filesize is 0 bytes at file: {0}'.format(filename))
         return False

-def grab_url(url, args, outdir):
-    ''' function keeping all the steps for the user call of grabbing
-    just one pdf and analysing it
-    '''
-    header_data = {'User-Agent': get_random_agent()}
-    data = download_pdf(url, args, header_data)
-    if data != -1:
-        savepath = store_pdf(url, data, outdir)
-        _parse_pdf(savepath)
-    return
-
 def seek_and_analyse(search, args, outdir):
     ''' function for keeping all the steps of searching for pdfs and analysing
     them together
     '''
+    # check how many hits we got
+    # seems like the method is broken in googlsearch library :(
+    #code, hits = hits_google(search,args)
+    #if code:
+    #    print('Got {0} hits'.format(hits))
+
     # use the search function of googlesearch to get the results
-    code, values = search_pdf(search, args)
+    code, values = search_google(search, args)
     if not code:
         if values.code == 429:
             logger.warning('[-] Too many requests, time to change ip address or use proxychains')
@@ -362,7 +290,11 @@ def seek_and_analyse(search, args, outdir):
         item = url_q.get()
         # print(item)
         url = item['url']
-        grab_url(url, args, outdir)
+        rd_grabrun = grab_run(url, args, outdir)
+        code = rd_grabrun['code']
+        savepath = rd_grabrun['data']
+        if code:
+            _parse_pdf(savepath)

     return True
@@ -372,6 +304,9 @@ def run(args):
     # initialize logger
     logger.info('{0} Started'.format(name))

+    # create some variables
     # outfile name
     if args.outfile:
         out_filename = args.outfile
@@ -381,6 +316,7 @@ def run(args):
     # specify output directory
     outdir = args.outdir

     # create output directory
     make_directory(outdir)
@@ -417,68 +353,43 @@ def run(args):
             fpath = '%s/%s' % (directory, f)
             _parse_pdf(fpath)
+    # simply generate html report from json outfile
+    elif args.gen_html_report:
+        fr = open(args.gen_html_report, 'r')
+        analysis_dict = json.loads(fr.read())
+        if create_html_report(analysis_dict, outdir, out_filename):
+            logger.info('Successfully created html report')
+            sys.exit(0)
+        else:
+            sys.exit(1)
     else:
         print('[-] Dunno what to do, bro. Use help. {0} -h'.format(sys.argv[0]))
+        sys.exit(1)

-    # move analysis dictionary in queue back to dictionary
-    analysis_dict = {}
-    while ana_q.empty() == False:
-        item = ana_q.get()
-        # print('item ', item)
-        analysis_dict.update(item)
-    #print('dict:',analysis_dict)
-    # ana_q is empty now
-
-    # create txt output
-    sep = '-' * 80 + '\n'
-    txtout = "%s/%s.txt" % (outdir, out_filename)
-    fwtxt = open(txtout, 'w')
-    # print(analysis_dict)
-    for k in analysis_dict.keys():
-        fwtxt.write(sep)
-        fname = 'File: %s\n' % (analysis_dict[k]['filename'])
-        ddata = analysis_dict[k]['data']
-        fwtxt.write(fname)
-        for kdata in ddata.keys():
-            metatxt = '%s:%s\n' % (kdata, ddata[kdata])
-            fwtxt.write(metatxt)
-        fwtxt.write(sep)
-    fwtxt.close()
-
-    # create json output
-    jsonout = "%s/%s.json" % (outdir, out_filename)
-    fwjson = open(jsonout, 'w')
-    # print(analysis_dict)
-    jdata = json.dumps(analysis_dict)
-    fwjson.write(jdata)
-    fwjson.close()
-
-    # create html from json
-    htmlout = "%s/%s.html" % (outdir, out_filename)
-    fwhtml = open(htmlout,'w')
-    #print(jdata)
-    html = json2html.convert(json = jdata)
-    fwhtml.write(html)
-    fwhtml.close()
-
-    # create url savefile
-    # print('url_d: ', url_d)
-    jsonurlout = "%s/%s_url.json" % (outdir, out_filename)
-    fwjson = open(jsonurlout, 'w')
-    jdata = json.dumps(url_d)
-    fwjson.write(jdata)
-    fwjson.close()
-
-    txtout = "%s/%s_url.txt" % (outdir, out_filename)
-    fwtxt = open(txtout, 'w')
-    for k in url_d.keys():
-        ddata = url_d[k]
-        metatxt = '%s:%s\n' % (ddata['url'], ddata['filename'])
-        fwtxt.write(metatxt)
-    fwtxt.close()
+    # creating the analysis dictionary for reporting
+    analysis_dict = prepare_analysis_dict(ana_q)
+
+    # lets go through the different reporting types
+    if args.report_txt:
+        if create_txt_report(analysis_dict, outdir, out_filename):
+            logger.info('Successfully created txt report')
+    if args.report_json:
+        if create_json_report(analysis_dict, outdir, out_filename):
+            logger.info('Successfully created json report')
+    if args.report_html:
+        if create_html_report(analysis_dict, outdir, out_filename):
+            logger.info('Successfully created html report')
+    if args.report_url_txt:
+        if create_url_txt(url_d, outdir, out_filename):
+            logger.info('Successfully created txt url report')
+    if args.report_url_json:
+        if create_url_json(url_d, outdir, out_filename):
+            logger.info('Successfully created json url report')

     return 42
@@ -504,8 +415,14 @@ def main():
help="specify domain or tld to scrape for pdf-files", default=None) help="specify domain or tld to scrape for pdf-files", default=None)
parser.add_argument('-sn', '--search-number', action='store', dest='search_stop', required=False, parser.add_argument('-sn', '--search-number', action='store', dest='search_stop', required=False,
help="specify how many files are searched", default=10, type=int) help="specify how many files are searched", default=10, type=int)
parser.add_argument('-z', '--disable-cert-check', action='store_false', dest='cert_check', required=False, parser.add_argument('-z', '--disable-cert-check', action='store_false', dest='cert_check', required=False,help="if the target domain(s) run with old or bad certificates", default=True)
help="if the target domain(s) run with old or bad certificates", default=True)
parser.add_argument('-ghr', '--gen-html-report', action='store', dest='gen_html_report', required=False,help="If you want to generate the html report after editing the json outfile (parameter: pdfgrab_analysis.json)")
parser.add_argument('-rtd', '--report-text-disable', action='store_false', dest='report_txt', required=False,help="Disable txt report",default=True)
parser.add_argument('-rjd', '--report-json-disable', action='store_false', dest='report_json', required=False,help="Disable json report",default=True)
parser.add_argument('-rhd', '--report-html-disable', action='store_false', dest='report_html', required=False,help="Disable html report",default=True)
parser.add_argument('-rutd', '--report-url-text-disable', action='store_false', dest='report_url_txt', required=False,help="Disable url txt report",default=True)
parser.add_argument('-rujd', '--report-url-json-disable', action='store_false', dest='report_url_json', required=False,help="Disable url json report",default=True)
if len(sys.argv)<2: if len(sys.argv)<2:
parser.print_help(sys.stderr) parser.print_help(sys.stderr)
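
With these flags in place, the new -ghr option lets you regenerate just the html report after hand-editing the json outfile. A hypothetical round trip, assuming the default output directory pdfgrab/ seen in the README example and the pdfgrab_analysis.json name mentioned in the help text above:
```
# ./pdfgrab.py -s kernel.org -rhd
# ... edit pdfgrab/pdfgrab_analysis.json as needed ...
# ./pdfgrab.py -ghr pdfgrab/pdfgrab_analysis.json
```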

supply/pdf.png (new binary file, 24 KiB; content not shown)

supply/pdf_base64.png (new file, 1 line; diff suppressed because the line is too long)