Release of version 0.4.7: added html reporting, added logging, reordered libraries, added experimental xmp meta data support, fixed a bug introduced by the xmp meta data support, added todo list

2019-11-05 14:42:24 +01:00
parent fa3b925d6f
commit e1d7c3f760
7 changed files with 476 additions and 388 deletions
--- a/docs/Changelog
+++ b/docs/Changelog
@@ -0,0 +1,20 @@
+Changelog
+=========
+
+Version 0.4.7
+-------------
+
+* added html output
+* added xmp meta testing
+
+Version 0.4.6
+-------------
+
+* added help output when no argument is given at the cli
+* added googlesearch lib
+
+Version 0.4.5
+-------------
+
+* exported helper functions to libs/libhelper.py
+* added libs/liblog.py
--- a/docs/Todo
+++ b/docs/Todo
@@ -0,0 +1,4 @@
+* add xmp meta to output files
+* code reordering
+* clean up parsing functions
+* add report formats
--- a/libs/__init__.py
+++ b/libs/__init__.py
--- a/libs/libgoogle.py
+++ b/libs/libgoogle.py
@@ -0,0 +1,30 @@
+import googlesearch as gs
+import urllib
+from libs.libhelper import *
+
+def get_random_agent():
+    return (gs.get_random_user_agent())
+
+def search_pdf(search, args):
+    ''' the function where googlesearch from mario vilas
+		is called
+	'''
+
+    search_stop = args.search_stop
+
+    query = '%s filetype:pdf' % search
+    # print(query)
+    urls = []
+
+    try:
+        for url in gs.search(query, num=20, stop=search_stop, user_agent=gs.get_random_user_agent()):
+            #print(url)
+            urls.append(url)
+
+    except urllib.error.HTTPError as e:
+        print('Error: %s' % e)
+        return -1
+
+
+    return urls
+
--- a/libs/libhelper.py
+++ b/libs/libhelper.py
@@ -0,0 +1,37 @@
+import os
+import sys
+from Crypto.Hash import SHA256
+
+def make_directory(outdir):
+    ''' naive mkdir function '''
+    try:
+        os.mkdir(outdir)
+    except:
+        # print("[W] mkdir, some error, directory probably exists")
+        pass
+
+def url_strip(url):
+    url = url.rstrip("\n")
+    url = url.rstrip("\r")
+    return url
+
+def create_sha256(hdata):
+    ''' introduced to create hashes of filenames, to have a uniqid
+		of course hashes of the file itself will be the next topic
+	'''
+    hobject = SHA256.new(data=hdata.encode())
+    return (hobject.hexdigest())
+
+def find_name(pdf):
+    ''' simply parses the urlencoded name and extracts the storage name
+		i would not be surprised this naive approach can lead to fuckups
+	'''
+
+    # find the name of the file
+    name = pdf.split("/")
+    a = len(name)
+    name = name[a - 1]
+    # print(name)
+
+    return name
+
--- a/libs/liblog.py
+++ b/libs/liblog.py
@@ -0,0 +1,17 @@
+import logging
+
+logger = logging.getLogger(__name__)
+logger.setLevel(logging.DEBUG)
+
+file_handler = logging.FileHandler('pdfgrab.log')
+
+console_handler = logging.StreamHandler()
+console_handler.setLevel(logging.WARNING)
+
+formatter = logging.Formatter('%(asctime)s:%(name)s:%(levelname)s:%(message)s')
+
+file_handler.setFormatter(formatter)
+console_handler.setFormatter(formatter)
+
+logger.addHandler(file_handler)
+logger.addHandler(console_handler)
--- a/pdfgrab.py
+++ b/pdfgrab.py
@@ -1,59 +1,33 @@
 #!/usr/bin/env python3
 #####################
-# yay - old tool adjusted for python3, using googlesearch now
-# and not some self crafted f00
-#
 # new features, new layout, new new :>
-# by dash at the end of September 2019
-# 
-# TODO
-# * add complete path in output as well as url where pdf came from
-# -> if url not exist like -F mode, then the local path
-# * clean up code
-# * fine tune google search
-# * add random timeout for new requests
-# -> maybe not necessary, gs has it ...
-# -> sort of necessary, on the other hand use proxychains man
-# * uh oh some fancy c0l0rs
-# * add thread support
-# * add scrape mode, to search for pdfs at the website itself
-# * add current error conditions to logfile
-#
-# Done
-# * add url list to output
-# * queues added, but no thread support yet
-# * json file output
-# * txt file output
-# * outfilename hardcoded
-# * add decryption routine
-# * catch ssl exceptions
-# * add random useragent for google and website pdf gathering
-# * set option for certificate verification, default is true
-# * catch conn refused connections
-# * catch filename to long thingy
+# by dash

-import os
-import sys
+import xml
+import argparse
 import json
+import os
 import queue
 import urllib
-import argparse
-import requests
+from json2html import *

-# remove somewhen ;)
-from IPython import embed
-
-from PyPDF2 import pdf
 import PyPDF2
-from Crypto.Hash import SHA256
-from collections import deque

 # googlesearch library
 import googlesearch as gs
+import requests
+from PyPDF2 import pdf
+
+# functions to extern files
+from libs.liblog import logger
+from libs.libhelper import *
+from libs.libgoogle import *
+
+from IPython import embed

 # some variables in regard of the tool itself
 name = 'pdfgrab'
-version 	= '0.4.4'
+version = '0.4.7'
 author = 'dash'
 date = '2019'

@@ -68,12 +42,16 @@ pdf_q = queue.Queue()
 # this is the analysis queue, keeping the data for further processing
 ana_q = queue.Queue()

-def create_sha256(hdata):
-	''' introduced to create hashes of filenames, to have a uniqid
-		of course hashes of the file itself will be the next topic
+def add_queue(tqueue, data):
+    ''' wrapper function for adding easy data to
+		created queues. otherwise the functions will be scattered with
+		endless queue commands ;)
 	'''
-	hobject = SHA256.new(data=hdata.encode())
-	return (hobject.hexdigest())
+
+    tqueue.put(data)
+    # d=tqueue.get()
+    #logging.debug(d)
+    return True

 def process_queue_data(filename, data, queue_type):
    ''' main function for processing gathered data
@@ -85,7 +63,7 @@ def process_queue_data(filename,data,queue_type):
    url_dict = {}

    if queue_type == 'doc_info':
-		print('[v] Queue DocInfo Data %s' % (filename))
+        logger.info('Queue DocInfo Data {0}'.format(filename))
        name = find_name(filename)
        path = filename

@@ -95,15 +73,19 @@ def process_queue_data(filename,data,queue_type):

        # order data in dict for analyse queue
        ana_dict = {path: {'filename': name, 'data': data}}
-#		print(data)
-#		print(ana_dict)
+        #print('data:',data)
+        #print('ana_dcit:',ana_dict)

        # add the data to queue
        add_queue(ana_q, ana_dict)

+    elif queue_type == 'doc_xmp_info':
+        logger.info('Queue DocXMPInfo Data {0}'.format(filename))
+        logger.warning('DocXMPInfo json processing not supported {0}'.format(filename))
+
    elif queue_type == 'url':
        # prepare queue entry
-		print('[v] Url Queue %s' % (data))
+        logger.info('Url Queue {0}'.format(data))
        url_dict = {'url': data, 'filename': filename}
        sha256 = create_sha256(data)
        url_d[sha256] = url_dict
@@ -113,28 +95,37 @@ def process_queue_data(filename,data,queue_type):

    else:
        print('[-] Sorry, unknown queue. DEBUG!')
+        logger.critical('Unknown queue')
        return False

    return True

-def add_queue(tqueue, data):
-	''' wrapper function for adding easy data to
-		created queues. otherwise the functions will be scattered with
-		endless queue commands ;)
+def get_xmp_meta_data(filename, filehandle):
+    ''' get the xmp meta data
    '''

-	tqueue.put(data)
-	#d=tqueue.get()
-	#print(d)
-	return True
+    err_dict = {}
+    real_extract = {}
+    xmp_dict = {}

-def url_strip(url):
-	url = url.rstrip("\n")
-	url = url.rstrip("\r")
-	return url
+    fh = filehandle

-def get_random_agent():
-	return (gs.get_random_user_agent())
+    try:
+        xmp_meta =  fh.getXmpMetadata()
+
+    except xml.parsers.expat.ExpatError as e:
+        print('Error: %s' % e)
+        err_dict = {'error': str(e)}
+        return -1
+
+    finally:
+        process_queue_data(filename, err_dict, 'doc_xmp_info')
+
+    if xmp_meta != None:
+        print('xmp_meta: {0} {1} {2} {3} {4} {5}'.format(xmp_meta.pdf_producer,xmp_meta.pdf_pdfversion,xmp_meta.dc_contributor,xmp_meta.dc_creator,xmp_meta.dc_date,xmp_meta.dc_subject))
+        xmp_dict = {}
+
+    return xmp_dict

 def get_DocInfo(filename, filehandle):
    ''' the easy way to extract metadata
@@ -183,7 +174,6 @@ def get_DocInfo(filename, filehandle):
        process_queue_data(filename, err_dict, 'doc_info')
        return -1

-
    try:
        for k in extract.keys():
            key = str(k)
@@ -200,7 +190,6 @@ def get_DocInfo(filename, filehandle):
        process_queue_data(filename, err_dict, 'doc_info')
        return -1

-
    process_queue_data(filename, real_extract, 'doc_info')


@@ -237,34 +226,16 @@ def check_encryption(filename):
        nfr = decrypt_empty_pdf(filename)
        if nfr != -1:
            get_DocInfo(filename, nfr)
+            get_xmp_meta_data(filename,nfr)

    else:
        get_DocInfo(filename, fr)
+        get_xmp_meta_data(filename,fr)

    # fr.close()

    return True

-def find_name(pdf):
-	''' simply parses the urlencoded name and extracts the storage name
-		i would not be surprised this naive approach can lead to fuckups
-	'''
-
-	#find the name of the file
-	name = pdf.split("/")
-	a = len(name)
-	name = name[a-1]
-	#print(name)
-
-	return name
-
-def make_directory(outdir):
-	''' naive mkdir function '''
-	try:
-		os.mkdir(outdir)
-	except:
-		#print("[W] mkdir, some error, directory probably exists")
-		pass

 def download_pdf(url, args, header_data):
    ''' downloading the pdfile for later analysis '''
@@ -293,10 +264,12 @@ def download_pdf(url, args, header_data):
    # print(len(data))
    return data

+
 def store_pdf(url, data, outdir):
    ''' storing the downloaded pdf data
    '''
-	print('[v] store_pdf')
+
+    logger.info('Store pdf')
    name = find_name(url)

    # only allow stored file a name with 50 chars
@@ -313,18 +286,20 @@ def store_pdf(url,data,outdir):
        return -1

    ret = f.write(data)
-	print('[+] Written %d bytes for File: %s' % (ret,save))
+    logger.info('Written {0} bytes for file: {1}'.format(ret,save))
    f.close()

    # return the savepath
    return save

+
 def _parse_pdf(filename):
    ''' the real parsing function '''

    ret = check_encryption(filename)
    return ret

+
 def grab_url(url, args, outdir):
    ''' function keeping all the steps for the user call of grabbing
 	just one pdf and analysing it
@@ -337,14 +312,18 @@ def grab_url(url, args, outdir):

    return

+
 def seek_and_analyse(search, args, outdir):
    ''' function for keeping all the steps of searching for pdfs and analysing
        them together
    '''
    # use the search function of googlesearch to get the results
-	search_pdf(search,args)
-	#urls = search_pdf(search,args)
+    urls=search_pdf(search, args)
+    for item in urls:
+        filename = find_name(item)
+        process_queue_data(filename, item, 'url')

+    # urls = search_pdf(search,args)

    # *if* we get an answer
    if url_q.empty() == False:
@@ -356,33 +335,13 @@ def seek_and_analyse(search,args,outdir):
            url = item['url']
            grab_url(url, args, outdir)

-def search_pdf(search, args):
-	''' the function where googlesearch from mario vilas
-		is called
-	'''

-	search_stop = args.search_stop
-
-	query='%s filetype:pdf' % search
-	#print(query)
-	urls = []
-
-	try:
-		for url in gs.search(query,num=20,stop=search_stop,user_agent=gs.get_random_user_agent()):
-			#print(url)
-			# parse out the name of the file in the url	
-			filename=find_name(url)
-			# add the file to queue
-			process_queue_data(filename,url,'url')
-			urls.append(url)
-	
-	except urllib.error.HTTPError as e:
-		print('Error: %s' % e)
-		return -1
-	#return urls

 def run(args):

+    # initialize logger
+    logger.info('{0} Started'.format(name))
+
    # outfile name
    if args.outfile:
        out_filename = args.outfile
@@ -398,27 +357,27 @@ def run(args):
    # lets see what the object is
    if args.url_single:
        url = args.url_single
-		print('[+] Grabbing %s' % (url))
+        logger.info('Grabbing {0}'.format(url))
+        logger.write_to_log('Grabbing %s' % (url))
        grab_url(url, args, outdir)

    elif args.file_single:
        pdffile = args.file_single
-		print('[+] Parsing %s' % (pdffile))
+        logger.info('Parsing {0}'.format(pdffile))
        _parse_pdf(pdffile)

    elif args.search:
        search = args.search
-		#print(args)
-		print('[+] Seek and de...erm...analysing %s' % (search))
+        logger.info('Seek and analyse {0}'.format(search))
        seek_and_analyse(search, args, outdir)

    elif args.files_dir:
        directory = args.files_dir
-		print('[+] Analyse pdfs in directory %s' % (directory))
+        logger.info('Analyse pdfs in directory {0}'.format(directory))
        try:
            files = os.listdir(directory)
        except:
-			print('Error')
+            logger.warning('Error in args.files_dir')
            return False

        for f in files:
@@ -428,7 +387,7 @@ def run(args):
                _parse_pdf(fpath)

    else:
-		print('[-] Dunno what to do, bro.')
+        print('[-] Dunno what to do, bro. Use help. {0} -h'.format(sys.argv[0]))

    # move analysis dictionary in queue back to dictionary
    analysis_dict = {}
@@ -437,6 +396,7 @@ def run(args):
        # print('item ', item)
        analysis_dict.update(item)

+    #print('dict:',analysis_dict)
    # ana_q is empty now

    # create txt output
@@ -458,15 +418,21 @@ def run(args):
    # create json output
    jsonout = "%s/%s.json" % (outdir, out_filename)
    fwjson = open(jsonout, 'w')
-	#for k in analysis_dict.keys():
-		#print(analysis_dict[k])
-	#	jdata = json.dumps(analysis_dict[k])

    # print(analysis_dict)
    jdata = json.dumps(analysis_dict)
    fwjson.write(jdata)
    fwjson.close()

+    # create html from json
+    htmlout = "%s/%s.html" % (outdir, out_filename)
+    fwhtml = open(htmlout,'w')
+    #print(jdata)
+    html = json2html.convert(json = jdata)
+    fwhtml.write(html)
+    fwhtml.close()
+    
+
    # create url savefile
    # print('url_d: ', url_d)
    jsonurlout = "%s/%s_url.json" % (outdir, out_filename)
@@ -475,7 +441,6 @@ def run(args):
    fwjson.write(jdata)
    fwjson.close()

-
    txtout = "%s/%s_url.txt" % (outdir, out_filename)
    fwtxt = open(txtout, 'w')
    for k in url_d.keys():
@@ -485,24 +450,39 @@ def run(args):
    fwtxt.close()

    return 42
+
+
 # This is the end my friend.

 def main():
    parser_desc = "%s %s %s in %s" % (name, version, author, date)
    parser = argparse.ArgumentParser(prog=name, description=parser_desc)
-	parser.add_argument('-O','--outdir',action='store',dest='outdir',required=False,help="define the outdirectory for downloaded files and analysis output",default='pdfgrab')
-	parser.add_argument('-o','--outfile',action='store',dest='outfile',required=False,help="define file with analysis output, if no parameter given it is outdir/pdfgrab_analysis, please note outfile is *always* written to output directory so do not add the dir as extra path")
-	parser.add_argument('-u','--url',action='store',dest='url_single',required=False,help="grab pdf from specified url for analysis",default=None)
+    parser.add_argument('-O', '--outdir', action='store', dest='outdir', required=False,
+                        help="define the outdirectory for downloaded files and analysis output", default='pdfgrab')
+    parser.add_argument('-o', '--outfile', action='store', dest='outfile', required=False,
+                        help="define file with analysis output, if no parameter given it is outdir/pdfgrab_analysis, please note outfile is *always* written to output directory so do not add the dir as extra path")
+    parser.add_argument('-u', '--url', action='store', dest='url_single', required=False,
+                        help="grab pdf from specified url for analysis", default=None)
    # parser.add_argument('-U','--url-list',action='store',dest='urls_many',required=False,help="specify txt file with list of pdf urls to grab",default=None)
    #########
-	parser.add_argument('-f','--file',action='store',dest='file_single',required=False,help="specify local path of pdf for analysis",default=None)
-	parser.add_argument('-F','--files-dir',action='store',dest='files_dir',required=False,help="specify local path of *directory* with pdf *files* for analysis",default=None)
-	parser.add_argument('-s','--search',action='store',dest='search',required=False,help="specify domain or tld to scrape for pdf-files",default=None)
-	parser.add_argument('-sn','--search-number',action='store',dest='search_stop',required=False,help="specify how many files are searched",default=10,type=int)
-	parser.add_argument('-z','--disable-cert-check',action='store_false',dest='cert_check',required=False,help="if the target domain(s) run with old or bad certificates",default=True)
+    parser.add_argument('-f', '--file', action='store', dest='file_single', required=False,
+                        help="specify local path of pdf for analysis", default=None)
+    parser.add_argument('-F', '--files-dir', action='store', dest='files_dir', required=False,
+                        help="specify local path of *directory* with pdf *files* for analysis", default=None)
+    parser.add_argument('-s', '--search', action='store', dest='search', required=False,
+                        help="specify domain or tld to scrape for pdf-files", default=None)
+    parser.add_argument('-sn', '--search-number', action='store', dest='search_stop', required=False,
+                        help="specify how many files are searched", default=10, type=int)
+    parser.add_argument('-z', '--disable-cert-check', action='store_false', dest='cert_check', required=False,
+                        help="if the target domain(s) run with old or bad certificates", default=True)
+
+    if len(sys.argv)<2:
+        parser.print_help(sys.stderr)
+        sys.exit()

    args = parser.parse_args()
    run(args)

+
 if __name__ == "__main__":
    main()