release of version 0.4.7: added HTML reporting, added logging, reordered libraries, added experimental XMP metadata, fixed a bug introduced by the XMP metadata change, added todo list

c0decave
2019-11-05 14:42:24 +01:00
parent fa3b925d6f
commit e1d7c3f760
7 changed files with 476 additions and 388 deletions

docs/Changelog (new file)

@@ -0,0 +1,20 @@
Changelog
=========
Version 4.7
-----------
* added html out
* added xmp meta testing
Version 4.6
-----------
* added help for non-argument given at cli
* added googlesearch lib
Version 4.5
-----------
* exported helper functions to libs/helper.py
* added libs/liblog.py

docs/Todo (new file)

@@ -0,0 +1,4 @@
* add xmp meta to output files
* code reordering
* clean up parsing functions
* add report formats

libs/__init__.py (new, empty file)

libs/libgoogle.py (new file)

@@ -0,0 +1,30 @@
import googlesearch as gs
import urllib

from libs.libhelper import *


def get_random_agent():
    return (gs.get_random_user_agent())


def search_pdf(search, args):
    ''' the function where googlesearch from mario vilas
        is called
    '''
    search_stop = args.search_stop
    query = '%s filetype:pdf' % search
    # print(query)
    urls = []
    try:
        for url in gs.search(query, num=20, stop=search_stop, user_agent=gs.get_random_user_agent()):
            # print(url)
            urls.append(url)
    except urllib.error.HTTPError as e:
        print('Error: %s' % e)
        return -1
    return urls
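
Note: search_pdf only needs the parsed args for its search_stop attribute and returns either a list of urls or -1 on an HTTPError. A minimal sketch of calling it outside the CLI flow (the Namespace and domain below are illustrative, not part of the commit):

import argparse
from libs.libgoogle import search_pdf

# stand-in for the parsed CLI args; search_stop mirrors -sn/--search-number (default 10)
args = argparse.Namespace(search_stop=10)
urls = search_pdf('example.com', args)
if urls != -1:
    for url in urls:
        print(url)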

libs/libhelper.py (new file)

@@ -0,0 +1,37 @@
import os
import sys

from Crypto.Hash import SHA256


def make_directory(outdir):
    ''' naive mkdir function '''
    try:
        os.mkdir(outdir)
    except:
        # print("[W] mkdir, some error, directory probably exists")
        pass


def url_strip(url):
    url = url.rstrip("\n")
    url = url.rstrip("\r")
    return url


def create_sha256(hdata):
    ''' introduced to create hashes of filenames, to have a uniqid
        of course hashes of the file itself will be the next topic
    '''
    hobject = SHA256.new(data=hdata.encode())
    return (hobject.hexdigest())


def find_name(pdf):
    ''' simply parses the urlencoded name and extracts the storage name
        i would not be surprised this naive approach can lead to fuckups
    '''
    # find the name of the file
    name = pdf.split("/")
    a = len(name)
    name = name[a - 1]
    # print(name)
    return name
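
Note: a quick sketch of how these helpers fit together (the url is illustrative; create_sha256 relies on the pycryptodome SHA256 module imported above):

from libs.libhelper import create_sha256, find_name, url_strip, make_directory

url = url_strip("https://example.com/docs/report.pdf\r\n")   # strip trailing CR/LF
name = find_name(url)            # -> 'report.pdf', the last path component
uid = create_sha256(url)         # -> 64-char hex digest used as a unique id
make_directory('pdfgrab')        # silently ignores an already existing directory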

libs/liblog.py (new file)

@@ -0,0 +1,17 @@
import logging
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
file_handler = logging.FileHandler('pdfgrab.log')
console_handler = logging.StreamHandler()
console_handler.setLevel(logging.WARNING)
formatter = logging.Formatter('%(asctime)s:%(name)s:%(levelname)s:%(message)s')
file_handler.setFormatter(formatter)
console_handler.setFormatter(formatter)
logger.addHandler(file_handler)
logger.addHandler(console_handler)
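
Note: other modules simply import this logger object, as the main script now does; a minimal sketch of the intended use (the messages are illustrative):

from libs.liblog import logger

logger.info('Grabbing https://example.com/a.pdf')   # DEBUG/INFO records go to pdfgrab.log only
logger.warning('certificate check disabled')        # WARNING and above also reach the console handler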

Modified file (the main pdfgrab script):

@@ -1,59 +1,33 @@
 #!/usr/bin/env python3
 #####################
-# yay - old tool adjusted for python3, using googlesearch now
-# and not some self crafted f00
-#
 # new features, new layout, new new :>
-# by dash at the end of September 2019
+# by dash
-#
-# TODO
-# * add complete path in output as well as url where pdf came from
-#   -> if url not exist like -F mode, then the local path
-# * clean up code
-# * fine tune google search
-# * add random timeout for new requests
-#   -> maybe not necessary, gs has it ...
-#   -> sort of necessary, on the other hand use proxychains man
-# * uh oh some fancy c0l0rs
-# * add thread support
-# * add scrape mode, to search for pdfs at the website itself
-# * add current error conditions to logfile
-#
-# Done
-# * add url list to output
-# * queues added, but no thread support yet
-# * json file output
-# * txt file output
-# * outfilename hardcoded
-# * add decryption routine
-# * catch ssl exceptions
-# * add random useragent for google and website pdf gathering
-# * set option for certificate verification, default is true
-# * catch conn refused connections
-# * catch filename to long thingy
-import os
-import sys
+import xml
+import argparse
 import json
+import os
 import queue
 import urllib
-import argparse
-import requests
-# remove somewhen ;)
-from IPython import embed
-from PyPDF2 import pdf
+from json2html import *
 import PyPDF2
-from Crypto.Hash import SHA256
-from collections import deque
 # googlesearch library
 import googlesearch as gs
+import requests
+from PyPDF2 import pdf
+# functions to extern files
+from libs.liblog import logger
+from libs.libhelper import *
+from libs.libgoogle import *
+from IPython import embed

 # some variables in regard of the tool itself
 name = 'pdfgrab'
-version = '0.4.4'
+version = '0.4.7'
 author = 'dash'
 date = '2019'
@@ -68,12 +42,16 @@ pdf_q = queue.Queue()
 # this is the analysis queue, keeping the data for further processing
 ana_q = queue.Queue()

-def create_sha256(hdata):
-    ''' introduced to create hashes of filenames, to have a uniqid
-        of course hashes of the file itself will be the next topic
+def add_queue(tqueue, data):
+    ''' wrapper function for adding easy data to
+        created queues. otherwise the functions will be scattered with
+        endless queue commands ;)
     '''
-    hobject = SHA256.new(data=hdata.encode())
-    return (hobject.hexdigest())
+    tqueue.put(data)
+    # d=tqueue.get()
+    # logging.debug(d)
+    return True

 def process_queue_data(filename, data, queue_type):
     ''' main function for processing gathered data
@@ -85,7 +63,7 @@ def process_queue_data(filename,data,queue_type):
     url_dict = {}

     if queue_type == 'doc_info':
-        print('[v] Queue DocInfo Data %s' % (filename))
+        logger.info('Queue DocInfo Data {0}'.format(filename))
         name = find_name(filename)
         path = filename
@@ -95,15 +73,19 @@ def process_queue_data(filename,data,queue_type):
         # order data in dict for analyse queue
         ana_dict = {path: {'filename': name, 'data': data}}
-        # print(data)
-        # print(ana_dict)
+        #print('data:',data)
+        #print('ana_dcit:',ana_dict)
         # add the data to queue
         add_queue(ana_q, ana_dict)

+    elif queue_type == 'doc_xmp_info':
+        logger.info('Queue DocXMPInfo Data {0}'.format(filename))
+        logger.warning('DocXMPInfo json processing not supported {0}'.format(filename))
+
     elif queue_type == 'url':
         # prepare queue entry
-        print('[v] Url Queue %s' % (data))
+        logger.info('Url Queue {0}'.format(data))
         url_dict = {'url': data, 'filename': filename}
         sha256 = create_sha256(data)
         url_d[sha256] = url_dict
@@ -113,28 +95,37 @@ def process_queue_data(filename,data,queue_type):
     else:
         print('[-] Sorry, unknown queue. DEBUG!')
+        logger.critical('Unknown queue')
         return False
     return True

-def add_queue(tqueue, data):
-    ''' wrapper function for adding easy data to
-        created queues. otherwise the functions will be scattered with
-        endless queue commands ;)
+def get_xmp_meta_data(filename, filehandle):
+    ''' get the xmp meta data
     '''
-    tqueue.put(data)
-    #d=tqueue.get()
-    #print(d)
-    return True
+    err_dict = {}
+    real_extract = {}
+    xmp_dict = {}

-def url_strip(url):
-    url = url.rstrip("\n")
-    url = url.rstrip("\r")
-    return url
+    fh = filehandle

-def get_random_agent():
-    return (gs.get_random_user_agent())
+    try:
+        xmp_meta = fh.getXmpMetadata()
+    except xml.parsers.expat.ExpatError as e:
+        print('Error: %s' % e)
+        err_dict = {'error': str(e)}
+        return -1
+    finally:
+        process_queue_data(filename, err_dict, 'doc_xmp_info')
+
+    if xmp_meta != None:
+        print('xmp_meta: {0} {1} {2} {3} {4} {5}'.format(xmp_meta.pdf_producer, xmp_meta.pdf_pdfversion, xmp_meta.dc_contributor, xmp_meta.dc_creator, xmp_meta.dc_date, xmp_meta.dc_subject))
+        xmp_dict = {}
+    return xmp_dict

 def get_DocInfo(filename, filehandle):
     ''' the easy way to extract metadata
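
Note: the new get_xmp_meta_data() builds on PyPDF2's getXmpMetadata(); stripped of the queue handling, the extraction amounts to roughly this sketch ('sample.pdf' is a placeholder, not part of the commit):

import PyPDF2

fh = PyPDF2.PdfFileReader(open('sample.pdf', 'rb'))
xmp_meta = fh.getXmpMetadata()          # returns None if the PDF carries no XMP packet
if xmp_meta != None:
    print(xmp_meta.pdf_producer, xmp_meta.pdf_pdfversion,
          xmp_meta.dc_creator, xmp_meta.dc_date)
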
@@ -183,7 +174,6 @@ def get_DocInfo(filename, filehandle):
         process_queue_data(filename, err_dict, 'doc_info')
         return -1
-
     try:
         for k in extract.keys():
             key = str(k)
@@ -200,7 +190,6 @@ def get_DocInfo(filename, filehandle):
         process_queue_data(filename, err_dict, 'doc_info')
         return -1
-
     process_queue_data(filename, real_extract, 'doc_info')
@@ -237,34 +226,16 @@ def check_encryption(filename):
         nfr = decrypt_empty_pdf(filename)
         if nfr != -1:
             get_DocInfo(filename, nfr)
+            get_xmp_meta_data(filename, nfr)
     else:
         get_DocInfo(filename, fr)
+        get_xmp_meta_data(filename, fr)
     # fr.close()
     return True

-def find_name(pdf):
-    ''' simply parses the urlencoded name and extracts the storage name
-        i would not be surprised this naive approach can lead to fuckups
-    '''
-    # find the name of the file
-    name = pdf.split("/")
-    a = len(name)
-    name = name[a - 1]
-    # print(name)
-    return name
-
-def make_directory(outdir):
-    ''' naive mkdir function '''
-    try:
-        os.mkdir(outdir)
-    except:
-        # print("[W] mkdir, some error, directory probably exists")
-        pass

 def download_pdf(url, args, header_data):
     ''' downloading the pdfile for later analysis '''
@@ -293,10 +264,12 @@ def download_pdf(url, args, header_data):
     # print(len(data))
     return data

 def store_pdf(url, data, outdir):
     ''' storing the downloaded pdf data
     '''
+    print('[v] store_pdf')
+    logger.info('Store pdf')
     name = find_name(url)
     # only allow stored file a name with 50 chars
@@ -313,18 +286,20 @@ def store_pdf(url,data,outdir):
         return -1
     ret = f.write(data)
-    print('[+] Written %d bytes for File: %s' % (ret,save))
+    logger.info('Written {0} bytes for file: {1}'.format(ret, save))
     f.close()
     # return the savepath
     return save

 def _parse_pdf(filename):
     ''' the real parsing function '''
     ret = check_encryption(filename)
     return ret

 def grab_url(url, args, outdir):
     ''' function keeping all the steps for the user call of grabbing
         just one pdf and analysing it
@@ -337,14 +312,18 @@ def grab_url(url, args, outdir):
     return

 def seek_and_analyse(search, args, outdir):
     ''' function for keeping all the steps of searching for pdfs and analysing
         them together
     '''
     # use the search function of googlesearch to get the results
-    search_pdf(search,args)
-    #urls = search_pdf(search,args)
+    urls = search_pdf(search, args)
+    for item in urls:
+        filename = find_name(item)
+        process_queue_data(filename, item, 'url')
+    # urls = search_pdf(search,args)

     # *if* we get an answer
     if url_q.empty() == False:
@@ -356,33 +335,13 @@ def seek_and_analyse(search,args,outdir):
             url = item['url']
             grab_url(url, args, outdir)

-def search_pdf(search, args):
-    ''' the function where googlesearch from mario vilas
-        is called
-    '''
-    search_stop = args.search_stop
-    query='%s filetype:pdf' % search
-    #print(query)
-    urls = []
-    try:
-        for url in gs.search(query,num=20,stop=search_stop,user_agent=gs.get_random_user_agent()):
-            #print(url)
-            # parse out the name of the file in the url
-            filename=find_name(url)
-            # add the file to queue
-            process_queue_data(filename,url,'url')
-            urls.append(url)
-    except urllib.error.HTTPError as e:
-        print('Error: %s' % e)
-        return -1
-    #return urls

 def run(args):
+    # initialize logger
+    logger.info('{0} Started'.format(name))

     # outfile name
     if args.outfile:
         out_filename = args.outfile
@@ -398,27 +357,27 @@ def run(args):
     # lets see what the object is
     if args.url_single:
         url = args.url_single
-        print('[+] Grabbing %s' % (url))
-        logger.write_to_log('Grabbing %s' % (url))
+        logger.info('Grabbing {0}'.format(url))
         grab_url(url, args, outdir)

     elif args.file_single:
         pdffile = args.file_single
-        print('[+] Parsing %s' % (pdffile))
+        logger.info('Parsing {0}'.format(pdffile))
         _parse_pdf(pdffile)

     elif args.search:
         search = args.search
-        #print(args)
-        print('[+] Seek and de...erm...analysing %s' % (search))
+        logger.info('Seek and analyse {0}'.format(search))
         seek_and_analyse(search, args, outdir)

     elif args.files_dir:
         directory = args.files_dir
-        print('[+] Analyse pdfs in directory %s' % (directory))
+        logger.info('Analyse pdfs in directory {0}'.format(directory))
         try:
             files = os.listdir(directory)
         except:
-            print('Error')
+            logger.warning('Error in args.files_dir')
             return False

         for f in files:
@@ -428,7 +387,7 @@ def run(args):
             _parse_pdf(fpath)
     else:
-        print('[-] Dunno what to do, bro.')
+        print('[-] Dunno what to do, bro. Use help. {0} -h'.format(sys.argv[0]))

     # move analysis dictionary in queue back to dictionary
     analysis_dict = {}
@@ -437,6 +396,7 @@ def run(args):
         # print('item ', item)
         analysis_dict.update(item)
+        #print('dict:',analysis_dict)

     # ana_q is empty now
     # create txt output
@@ -458,15 +418,21 @@ def run(args):
     # create json output
     jsonout = "%s/%s.json" % (outdir, out_filename)
     fwjson = open(jsonout, 'w')
-    #for k in analysis_dict.keys():
-        #print(analysis_dict[k])
-        # jdata = json.dumps(analysis_dict[k])
     # print(analysis_dict)
     jdata = json.dumps(analysis_dict)
     fwjson.write(jdata)
     fwjson.close()

+    # create html from json
+    htmlout = "%s/%s.html" % (outdir, out_filename)
+    fwhtml = open(htmlout, 'w')
+    #print(jdata)
+    html = json2html.convert(json = jdata)
+    fwhtml.write(html)
+    fwhtml.close()
+
     # create url savefile
     # print('url_d: ', url_d)
     jsonurlout = "%s/%s_url.json" % (outdir, out_filename)
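
Note: the new HTML report is just the JSON analysis string run through json2html; in isolation the conversion looks roughly like this (the sample dict and output path stand in for pdfgrab's real analysis data):

import json
from json2html import json2html

analysis_dict = {'pdfgrab/report.pdf': {'filename': 'report.pdf',
                                        'data': {'/Producer': 'LaTeX', '/Author': 'dash'}}}
jdata = json.dumps(analysis_dict)
html = json2html.convert(json = jdata)    # renders the dict as nested HTML tables
with open('pdfgrab/pdfgrab_analysis.html', 'w') as fwhtml:
    fwhtml.write(html)
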
@@ -475,7 +441,6 @@ def run(args):
     fwjson.write(jdata)
     fwjson.close()
-
     txtout = "%s/%s_url.txt" % (outdir, out_filename)
     fwtxt = open(txtout, 'w')
     for k in url_d.keys():
@@ -485,24 +450,39 @@ def run(args):
     fwtxt.close()

     return 42
     # This is the end my friend.

 def main():
     parser_desc = "%s %s %s in %s" % (name, version, author, date)
     parser = argparse.ArgumentParser(prog=name, description=parser_desc)
-    parser.add_argument('-O','--outdir',action='store',dest='outdir',required=False,help="define the outdirectory for downloaded files and analysis output",default='pdfgrab')
-    parser.add_argument('-o','--outfile',action='store',dest='outfile',required=False,help="define file with analysis output, if no parameter given it is outdir/pdfgrab_analysis, please note outfile is *always* written to output directory so do not add the dir as extra path")
-    parser.add_argument('-u','--url',action='store',dest='url_single',required=False,help="grab pdf from specified url for analysis",default=None)
+    parser.add_argument('-O', '--outdir', action='store', dest='outdir', required=False,
+                        help="define the outdirectory for downloaded files and analysis output", default='pdfgrab')
+    parser.add_argument('-o', '--outfile', action='store', dest='outfile', required=False,
+                        help="define file with analysis output, if no parameter given it is outdir/pdfgrab_analysis, please note outfile is *always* written to output directory so do not add the dir as extra path")
+    parser.add_argument('-u', '--url', action='store', dest='url_single', required=False,
+                        help="grab pdf from specified url for analysis", default=None)
     # parser.add_argument('-U','--url-list',action='store',dest='urls_many',required=False,help="specify txt file with list of pdf urls to grab",default=None)
     #########
-    parser.add_argument('-f','--file',action='store',dest='file_single',required=False,help="specify local path of pdf for analysis",default=None)
-    parser.add_argument('-F','--files-dir',action='store',dest='files_dir',required=False,help="specify local path of *directory* with pdf *files* for analysis",default=None)
-    parser.add_argument('-s','--search',action='store',dest='search',required=False,help="specify domain or tld to scrape for pdf-files",default=None)
-    parser.add_argument('-sn','--search-number',action='store',dest='search_stop',required=False,help="specify how many files are searched",default=10,type=int)
-    parser.add_argument('-z','--disable-cert-check',action='store_false',dest='cert_check',required=False,help="if the target domain(s) run with old or bad certificates",default=True)
+    parser.add_argument('-f', '--file', action='store', dest='file_single', required=False,
+                        help="specify local path of pdf for analysis", default=None)
+    parser.add_argument('-F', '--files-dir', action='store', dest='files_dir', required=False,
+                        help="specify local path of *directory* with pdf *files* for analysis", default=None)
+    parser.add_argument('-s', '--search', action='store', dest='search', required=False,
+                        help="specify domain or tld to scrape for pdf-files", default=None)
+    parser.add_argument('-sn', '--search-number', action='store', dest='search_stop', required=False,
+                        help="specify how many files are searched", default=10, type=int)
+    parser.add_argument('-z', '--disable-cert-check', action='store_false', dest='cert_check', required=False,
+                        help="if the target domain(s) run with old or bad certificates", default=True)
+
+    if len(sys.argv)<2:
+        parser.print_help(sys.stderr)
+        sys.exit()

     args = parser.parse_args()
     run(args)

 if __name__ == "__main__":
     main()