release of version 0.4.7: added HTML reporting, added logging, reordered libraries, added experimental XMP metadata, fixed a bug introduced by the XMP metadata change, added todo list

c0decave
2019-11-05 14:42:24 +01:00
parent fa3b925d6f
commit e1d7c3f760
7 changed files with 476 additions and 388 deletions

docs/Changelog (new file)

@@ -0,0 +1,20 @@
Changelog
=========
Version 4.7
-----------
* added html out
* added xmp meta testing
Version 4.6
-----------
* added help for non-argument given at cli
* added googlesearch lib
Version 4.5
-----------
* exported helper functions to libs/helper.py
* added libs/liblog.py

docs/Todo (new file)

@@ -0,0 +1,4 @@
* add xmp meta to output files
* code reordering
* clean up parsing functions
* add report formats

libs/__init__.py (new, empty file)

libs/libgoogle.py (new file)

@@ -0,0 +1,30 @@
import googlesearch as gs
import urllib

from libs.libhelper import *


def get_random_agent():
    return (gs.get_random_user_agent())


def search_pdf(search, args):
    ''' the function where googlesearch from mario vilas
        is called
    '''
    search_stop = args.search_stop
    query = '%s filetype:pdf' % search
    # print(query)
    urls = []
    try:
        for url in gs.search(query, num=20, stop=search_stop, user_agent=gs.get_random_user_agent()):
            # print(url)
            urls.append(url)
    except urllib.error.HTTPError as e:
        print('Error: %s' % e)
        return -1
    return urls
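
Note: search_pdf only needs the parsed args for its search_stop attribute and returns either a list of urls or -1 on an HTTPError. A minimal sketch of calling it outside the CLI flow (the Namespace and domain below are illustrative, not part of the commit):

import argparse
from libs.libgoogle import search_pdf

# stand-in for the parsed CLI args; search_stop mirrors -sn/--search-number (default 10)
args = argparse.Namespace(search_stop=10)
urls = search_pdf('example.com', args)
if urls != -1:
    for url in urls:
        print(url)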

libs/libhelper.py (new file)

@@ -0,0 +1,37 @@
import os
import sys

from Crypto.Hash import SHA256


def make_directory(outdir):
    ''' naive mkdir function '''
    try:
        os.mkdir(outdir)
    except:
        # print("[W] mkdir, some error, directory probably exists")
        pass


def url_strip(url):
    url = url.rstrip("\n")
    url = url.rstrip("\r")
    return url


def create_sha256(hdata):
    ''' introduced to create hashes of filenames, to have a uniqid
        of course hashes of the file itself will be the next topic
    '''
    hobject = SHA256.new(data=hdata.encode())
    return (hobject.hexdigest())


def find_name(pdf):
    ''' simply parses the urlencoded name and extracts the storage name
        i would not be surprised this naive approach can lead to fuckups
    '''
    # find the name of the file
    name = pdf.split("/")
    a = len(name)
    name = name[a - 1]
    # print(name)
    return name
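
Note: a quick sketch of how these helpers fit together (the url is illustrative; create_sha256 relies on the pycryptodome SHA256 module imported above):

from libs.libhelper import create_sha256, find_name, url_strip, make_directory

url = url_strip("https://example.com/docs/report.pdf\r\n")   # strip trailing CR/LF
name = find_name(url)            # -> 'report.pdf', the last path component
uid = create_sha256(url)         # -> 64-char hex digest used as a unique id
make_directory('pdfgrab')        # silently ignores an already existing directory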

libs/liblog.py (new file)

@@ -0,0 +1,17 @@
import logging
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
file_handler = logging.FileHandler('pdfgrab.log')
console_handler = logging.StreamHandler()
console_handler.setLevel(logging.WARNING)
formatter = logging.Formatter('%(asctime)s:%(name)s:%(levelname)s:%(message)s')
file_handler.setFormatter(formatter)
console_handler.setFormatter(formatter)
logger.addHandler(file_handler)
logger.addHandler(console_handler)
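
Note: other modules simply import this logger object, as the main script now does; a minimal sketch of the intended use (the messages are illustrative):

from libs.liblog import logger

logger.info('Grabbing https://example.com/a.pdf')   # DEBUG/INFO records go to pdfgrab.log only
logger.warning('certificate check disabled')        # WARNING and above also reach the console handler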

Modified file (the main pdfgrab script):

@@ -1,59 +1,33 @@
 #!/usr/bin/env python3
 #####################
-# yay - old tool adjusted for python3, using googlesearch now
-# and not some self crafted f00
-#
 # new features, new layout, new new :>
-# by dash at the end of September 2019
+# by dash
-#
-# TODO
-# * add complete path in output as well as url where pdf came from
-#   -> if url not exist like -F mode, then the local path
-# * clean up code
-# * fine tune google search
-# * add random timeout for new requests
-#   -> maybe not necessary, gs has it ...
-#   -> sort of necessary, on the other hand use proxychains man
-# * uh oh some fancy c0l0rs
-# * add thread support
-# * add scrape mode, to search for pdfs at the website itself
-# * add current error conditions to logfile
-#
-# Done
-# * add url list to output
-# * queues added, but no thread support yet
-# * json file output
-# * txt file output
-# * outfilename hardcoded
-# * add decryption routine
-# * catch ssl exceptions
-# * add random useragent for google and website pdf gathering
-# * set option for certificate verification, default is true
-# * catch conn refused connections
-# * catch filename to long thingy
-import os
-import sys
+import xml
+import argparse
 import json
+import os
 import queue
 import urllib
-import argparse
-import requests
-# remove somewhen ;)
-from IPython import embed
-from PyPDF2 import pdf
+from json2html import *
 import PyPDF2
-from Crypto.Hash import SHA256
-from collections import deque
 # googlesearch library
 import googlesearch as gs
+import requests
+from PyPDF2 import pdf
+# functions to extern files
+from libs.liblog import logger
+from libs.libhelper import *
+from libs.libgoogle import *
+from IPython import embed

 # some variables in regard of the tool itself
 name = 'pdfgrab'
-version = '0.4.4'
+version = '0.4.7'
 author = 'dash'
 date = '2019'
@@ -68,12 +42,16 @@ pdf_q = queue.Queue()
 # this is the analysis queue, keeping the data for further processing
 ana_q = queue.Queue()

-def create_sha256(hdata):
-    ''' introduced to create hashes of filenames, to have a uniqid
-        of course hashes of the file itself will be the next topic
+def add_queue(tqueue, data):
+    ''' wrapper function for adding easy data to
+        created queues. otherwise the functions will be scattered with
+        endless queue commands ;)
     '''
-    hobject = SHA256.new(data=hdata.encode())
-    return (hobject.hexdigest())
+    tqueue.put(data)
+    # d=tqueue.get()
+    # logging.debug(d)
+    return True

 def process_queue_data(filename, data, queue_type):
     ''' main function for processing gathered data
@@ -85,7 +63,7 @@ def process_queue_data(filename,data,queue_type):
     url_dict = {}

     if queue_type == 'doc_info':
-        print('[v] Queue DocInfo Data %s' % (filename))
+        logger.info('Queue DocInfo Data {0}'.format(filename))
         name = find_name(filename)
         path = filename
@@ -95,15 +73,19 @@ def process_queue_data(filename,data,queue_type):
         # order data in dict for analyse queue
         ana_dict = {path: {'filename': name, 'data': data}}
-        # print(data)
-        # print(ana_dict)
+        #print('data:',data)
+        #print('ana_dcit:',ana_dict)
         # add the data to queue
         add_queue(ana_q, ana_dict)

+    elif queue_type == 'doc_xmp_info':
+        logger.info('Queue DocXMPInfo Data {0}'.format(filename))
+        logger.warning('DocXMPInfo json processing not supported {0}'.format(filename))
+
     elif queue_type == 'url':
         # prepare queue entry
-        print('[v] Url Queue %s' % (data))
+        logger.info('Url Queue {0}'.format(data))
         url_dict = {'url': data, 'filename': filename}
         sha256 = create_sha256(data)
         url_d[sha256] = url_dict
@@ -113,28 +95,37 @@ def process_queue_data(filename,data,queue_type):
     else:
         print('[-] Sorry, unknown queue. DEBUG!')
+        logger.critical('Unknown queue')
         return False
     return True

-def add_queue(tqueue, data):
-    ''' wrapper function for adding easy data to
-        created queues. otherwise the functions will be scattered with
-        endless queue commands ;)
+def get_xmp_meta_data(filename, filehandle):
+    ''' get the xmp meta data
     '''
-    tqueue.put(data)
-    #d=tqueue.get()
-    #print(d)
-    return True
+    err_dict = {}
+    real_extract = {}
+    xmp_dict = {}

-def url_strip(url):
-    url = url.rstrip("\n")
-    url = url.rstrip("\r")
-    return url
+    fh = filehandle

-def get_random_agent():
-    return (gs.get_random_user_agent())
+    try:
+        xmp_meta = fh.getXmpMetadata()
+    except xml.parsers.expat.ExpatError as e:
+        print('Error: %s' % e)
+        err_dict = {'error': str(e)}
+        return -1
+    finally:
+        process_queue_data(filename, err_dict, 'doc_xmp_info')
+
+    if xmp_meta != None:
+        print('xmp_meta: {0} {1} {2} {3} {4} {5}'.format(xmp_meta.pdf_producer, xmp_meta.pdf_pdfversion, xmp_meta.dc_contributor, xmp_meta.dc_creator, xmp_meta.dc_date, xmp_meta.dc_subject))
+        xmp_dict = {}
+    return xmp_dict

 def get_DocInfo(filename, filehandle):
     ''' the easy way to extract metadata
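
Note: the new get_xmp_meta_data() builds on PyPDF2's getXmpMetadata(); stripped of the queue handling, the extraction amounts to roughly this sketch ('sample.pdf' is a placeholder, not part of the commit):

import PyPDF2

fh = PyPDF2.PdfFileReader(open('sample.pdf', 'rb'))
xmp_meta = fh.getXmpMetadata()          # returns None if the PDF carries no XMP packet
if xmp_meta != None:
    print(xmp_meta.pdf_producer, xmp_meta.pdf_pdfversion,
          xmp_meta.dc_creator, xmp_meta.dc_date)
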
@@ -183,7 +174,6 @@ def get_DocInfo(filename, filehandle):
         process_queue_data(filename, err_dict, 'doc_info')
         return -1
-
     try:
         for k in extract.keys():
             key = str(k)
@@ -200,7 +190,6 @@ def get_DocInfo(filename, filehandle):
         process_queue_data(filename, err_dict, 'doc_info')
         return -1
-
     process_queue_data(filename, real_extract, 'doc_info')
@@ -237,34 +226,16 @@ def check_encryption(filename):
         nfr = decrypt_empty_pdf(filename)
         if nfr != -1:
             get_DocInfo(filename, nfr)
+            get_xmp_meta_data(filename, nfr)
     else:
         get_DocInfo(filename, fr)
+        get_xmp_meta_data(filename, fr)
     # fr.close()
     return True

-def find_name(pdf):
-    ''' simply parses the urlencoded name and extracts the storage name
-        i would not be surprised this naive approach can lead to fuckups
-    '''
-    # find the name of the file
-    name = pdf.split("/")
-    a = len(name)
-    name = name[a - 1]
-    # print(name)
-    return name
-
-def make_directory(outdir):
-    ''' naive mkdir function '''
-    try:
-        os.mkdir(outdir)
-    except:
-        # print("[W] mkdir, some error, directory probably exists")
-        pass

 def download_pdf(url, args, header_data):
     ''' downloading the pdfile for later analysis '''
@@ -293,10 +264,12 @@ def download_pdf(url, args, header_data):
     # print(len(data))
     return data

 def store_pdf(url, data, outdir):
     ''' storing the downloaded pdf data
     '''
+    print('[v] store_pdf')
+    logger.info('Store pdf')
     name = find_name(url)
     # only allow stored file a name with 50 chars
@@ -313,18 +286,20 @@ def store_pdf(url,data,outdir):
         return -1
     ret = f.write(data)
-    print('[+] Written %d bytes for File: %s' % (ret,save))
+    logger.info('Written {0} bytes for file: {1}'.format(ret, save))
     f.close()
     # return the savepath
     return save

 def _parse_pdf(filename):
     ''' the real parsing function '''
     ret = check_encryption(filename)
     return ret

 def grab_url(url, args, outdir):
     ''' function keeping all the steps for the user call of grabbing
         just one pdf and analysing it
@@ -337,14 +312,18 @@ def grab_url(url, args, outdir):
     return

 def seek_and_analyse(search, args, outdir):
     ''' function for keeping all the steps of searching for pdfs and analysing
         them together
     '''
     # use the search function of googlesearch to get the results
-    search_pdf(search,args)
-    #urls = search_pdf(search,args)
+    urls = search_pdf(search, args)
+    for item in urls:
+        filename = find_name(item)
+        process_queue_data(filename, item, 'url')
+    # urls = search_pdf(search,args)

     # *if* we get an answer
     if url_q.empty() == False:
@@ -356,33 +335,13 @@ def seek_and_analyse(search,args,outdir):
             url = item['url']
             grab_url(url, args, outdir)

-def search_pdf(search, args):
-    ''' the function where googlesearch from mario vilas
-        is called
-    '''
-    search_stop = args.search_stop
-    query='%s filetype:pdf' % search
-    #print(query)
-    urls = []
-    try:
-        for url in gs.search(query,num=20,stop=search_stop,user_agent=gs.get_random_user_agent()):
-            #print(url)
-            # parse out the name of the file in the url
-            filename=find_name(url)
-            # add the file to queue
-            process_queue_data(filename,url,'url')
-            urls.append(url)
-    except urllib.error.HTTPError as e:
-        print('Error: %s' % e)
-        return -1
-    #return urls

 def run(args):
+    # initialize logger
+    logger.info('{0} Started'.format(name))

     # outfile name
     if args.outfile:
         out_filename = args.outfile
@@ -398,27 +357,27 @@ def run(args):
     # lets see what the object is
     if args.url_single:
         url = args.url_single
-        print('[+] Grabbing %s' % (url))
-        logger.write_to_log('Grabbing %s' % (url))
+        logger.info('Grabbing {0}'.format(url))
         grab_url(url, args, outdir)

     elif args.file_single:
         pdffile = args.file_single
-        print('[+] Parsing %s' % (pdffile))
+        logger.info('Parsing {0}'.format(pdffile))
         _parse_pdf(pdffile)

     elif args.search:
         search = args.search
-        #print(args)
-        print('[+] Seek and de...erm...analysing %s' % (search))
+        logger.info('Seek and analyse {0}'.format(search))
         seek_and_analyse(search, args, outdir)

     elif args.files_dir:
         directory = args.files_dir
-        print('[+] Analyse pdfs in directory %s' % (directory))
+        logger.info('Analyse pdfs in directory {0}'.format(directory))
         try:
             files = os.listdir(directory)
         except:
-            print('Error')
+            logger.warning('Error in args.files_dir')
             return False

         for f in files:
@@ -428,7 +387,7 @@ def run(args):
             _parse_pdf(fpath)
     else:
-        print('[-] Dunno what to do, bro.')
+        print('[-] Dunno what to do, bro. Use help. {0} -h'.format(sys.argv[0]))

     # move analysis dictionary in queue back to dictionary
     analysis_dict = {}
@@ -437,6 +396,7 @@ def run(args):
         # print('item ', item)
         analysis_dict.update(item)
+        #print('dict:',analysis_dict)

     # ana_q is empty now
     # create txt output
@@ -458,15 +418,21 @@ def run(args):
     # create json output
     jsonout = "%s/%s.json" % (outdir, out_filename)
     fwjson = open(jsonout, 'w')
-    #for k in analysis_dict.keys():
-        #print(analysis_dict[k])
-        # jdata = json.dumps(analysis_dict[k])
     # print(analysis_dict)
     jdata = json.dumps(analysis_dict)
     fwjson.write(jdata)
     fwjson.close()

+    # create html from json
+    htmlout = "%s/%s.html" % (outdir, out_filename)
+    fwhtml = open(htmlout, 'w')
+    #print(jdata)
+    html = json2html.convert(json = jdata)
+    fwhtml.write(html)
+    fwhtml.close()
+
     # create url savefile
     # print('url_d: ', url_d)
     jsonurlout = "%s/%s_url.json" % (outdir, out_filename)
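
Note: the new HTML report is just the JSON analysis string run through json2html; in isolation the conversion looks roughly like this (the sample dict and output path stand in for pdfgrab's real analysis data):

import json
from json2html import json2html

analysis_dict = {'pdfgrab/report.pdf': {'filename': 'report.pdf',
                                        'data': {'/Producer': 'LaTeX', '/Author': 'dash'}}}
jdata = json.dumps(analysis_dict)
html = json2html.convert(json = jdata)    # renders the dict as nested HTML tables
with open('pdfgrab/pdfgrab_analysis.html', 'w') as fwhtml:
    fwhtml.write(html)
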
@@ -475,7 +441,6 @@ def run(args):
     fwjson.write(jdata)
     fwjson.close()
-
     txtout = "%s/%s_url.txt" % (outdir, out_filename)
     fwtxt = open(txtout, 'w')
     for k in url_d.keys():
@@ -485,24 +450,39 @@ def run(args):
     fwtxt.close()

     return 42
     # This is the end my friend.

 def main():
     parser_desc = "%s %s %s in %s" % (name, version, author, date)
     parser = argparse.ArgumentParser(prog=name, description=parser_desc)
-    parser.add_argument('-O','--outdir',action='store',dest='outdir',required=False,help="define the outdirectory for downloaded files and analysis output",default='pdfgrab')
-    parser.add_argument('-o','--outfile',action='store',dest='outfile',required=False,help="define file with analysis output, if no parameter given it is outdir/pdfgrab_analysis, please note outfile is *always* written to output directory so do not add the dir as extra path")
-    parser.add_argument('-u','--url',action='store',dest='url_single',required=False,help="grab pdf from specified url for analysis",default=None)
+    parser.add_argument('-O', '--outdir', action='store', dest='outdir', required=False,
+                        help="define the outdirectory for downloaded files and analysis output", default='pdfgrab')
+    parser.add_argument('-o', '--outfile', action='store', dest='outfile', required=False,
+                        help="define file with analysis output, if no parameter given it is outdir/pdfgrab_analysis, please note outfile is *always* written to output directory so do not add the dir as extra path")
+    parser.add_argument('-u', '--url', action='store', dest='url_single', required=False,
+                        help="grab pdf from specified url for analysis", default=None)
     # parser.add_argument('-U','--url-list',action='store',dest='urls_many',required=False,help="specify txt file with list of pdf urls to grab",default=None)
     #########
-    parser.add_argument('-f','--file',action='store',dest='file_single',required=False,help="specify local path of pdf for analysis",default=None)
-    parser.add_argument('-F','--files-dir',action='store',dest='files_dir',required=False,help="specify local path of *directory* with pdf *files* for analysis",default=None)
-    parser.add_argument('-s','--search',action='store',dest='search',required=False,help="specify domain or tld to scrape for pdf-files",default=None)
-    parser.add_argument('-sn','--search-number',action='store',dest='search_stop',required=False,help="specify how many files are searched",default=10,type=int)
-    parser.add_argument('-z','--disable-cert-check',action='store_false',dest='cert_check',required=False,help="if the target domain(s) run with old or bad certificates",default=True)
+    parser.add_argument('-f', '--file', action='store', dest='file_single', required=False,
+                        help="specify local path of pdf for analysis", default=None)
+    parser.add_argument('-F', '--files-dir', action='store', dest='files_dir', required=False,
+                        help="specify local path of *directory* with pdf *files* for analysis", default=None)
+    parser.add_argument('-s', '--search', action='store', dest='search', required=False,
+                        help="specify domain or tld to scrape for pdf-files", default=None)
+    parser.add_argument('-sn', '--search-number', action='store', dest='search_stop', required=False,
+                        help="specify how many files are searched", default=10, type=int)
+    parser.add_argument('-z', '--disable-cert-check', action='store_false', dest='cert_check', required=False,
+                        help="if the target domain(s) run with old or bad certificates", default=True)
+
+    if len(sys.argv)<2:
+        parser.print_help(sys.stderr)
+        sys.exit()

     args = parser.parse_args()
     run(args)

 if __name__ == "__main__":
     main()