From e1d7c3f7600f7c5ca36585c7f958e0d4479f2fb1 Mon Sep 17 00:00:00 2001
From: c0decave
Date: Tue, 5 Nov 2019 14:42:24 +0100
Subject: [PATCH] release of version 0.4.7: added HTML reporting, added
 logging, reordered libraries, added experimental XMP metadata support, fixed
 a bug introduced by the XMP metadata code, added a todo list

---
 docs/Changelog    |  20 ++
 docs/Todo         |   4 +
 libs/__init__.py  |   0
 libs/libgoogle.py |  30 ++
 libs/libhelper.py |  37 +++
 libs/liblog.py    |  17 ++
 pdfgrab.py        | 756 ++++++++++++++++++++++------------------
 7 files changed, 476 insertions(+), 388 deletions(-)
 create mode 100644 docs/Changelog
 create mode 100644 docs/Todo
 create mode 100644 libs/__init__.py
 create mode 100644 libs/libgoogle.py
 create mode 100644 libs/libhelper.py
 create mode 100644 libs/liblog.py

diff --git a/docs/Changelog b/docs/Changelog
new file mode 100644
index 0000000..4fb9939
--- /dev/null
+++ b/docs/Changelog
@@ -0,0 +1,20 @@
+Changelog
+=========
+
+Version 0.4.7
+-------------
+
+* added HTML output
+* added experimental XMP metadata testing
+
+Version 0.4.6
+-------------
+
+* added help output when no arguments are given at the CLI
+* added googlesearch lib
+
+Version 0.4.5
+-------------
+
+* exported helper functions to libs/libhelper.py
+* added libs/liblog.py
diff --git a/docs/Todo b/docs/Todo
new file mode 100644
index 0000000..3dd5455
--- /dev/null
+++ b/docs/Todo
@@ -0,0 +1,4 @@
+* add XMP metadata to the output files
+* code reordering
+* clean up parsing functions
+* add report formats
diff --git a/libs/__init__.py b/libs/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/libs/libgoogle.py b/libs/libgoogle.py
new file mode 100644
index 0000000..e374081
--- /dev/null
+++ b/libs/libgoogle.py
@@ -0,0 +1,30 @@
+import googlesearch as gs
+import urllib
+
+from libs.libhelper import *
+
+def get_random_agent():
+    return (gs.get_random_user_agent())
+
+def search_pdf(search, args):
+    ''' the function where googlesearch by Mario Vilas
+        is called
+    '''
+
+    search_stop = args.search_stop
+
+    query = '%s filetype:pdf' % search
+    # print(query)
+    urls = []
+
+    try:
+        for url in gs.search(query, num=20, stop=search_stop, user_agent=gs.get_random_user_agent()):
+            # print(url)
+            urls.append(url)
+
+    except urllib.error.HTTPError as e:
+        print('Error: %s' % e)
+        return -1
+
+    return urls
diff --git a/libs/libhelper.py b/libs/libhelper.py
new file mode 100644
index 0000000..1861e8b
--- /dev/null
+++ b/libs/libhelper.py
@@ -0,0 +1,37 @@
+import os
+import sys
+
+from Crypto.Hash import SHA256
+
+def make_directory(outdir):
+    ''' naive mkdir function '''
+    try:
+        os.mkdir(outdir)
+    except OSError:
+        # print("[W] mkdir failed, directory probably exists")
+        pass
+
+def url_strip(url):
+    url = url.rstrip("\n")
+    url = url.rstrip("\r")
+    return url
+
+def create_sha256(hdata):
+    ''' introduced to create hashes of filenames, to have a unique id;
+        hashes of the file contents themselves are the next topic
+    '''
+    hobject = SHA256.new(data=hdata.encode())
+    return (hobject.hexdigest())
+
+def find_name(pdf):
+    ''' simply parses the urlencoded name and extracts the storage name;
+        i would not be surprised if this naive approach breaks on unusual URLs
+    '''
+
+    # find the name of the file
+    name = pdf.split("/")
+    a = len(name)
+    name = name[a - 1]
+    # print(name)
+
+    return name
diff --git a/libs/liblog.py b/libs/liblog.py
new file mode 100644
index 0000000..2c01d25
--- /dev/null
+++ b/libs/liblog.py
@@ -0,0 +1,17 @@
+import logging
+
+logger = logging.getLogger(__name__)
+logger.setLevel(logging.DEBUG)
+
+file_handler = logging.FileHandler('pdfgrab.log')
+
+console_handler
= logging.StreamHandler() +console_handler.setLevel(logging.WARNING) + +formatter = logging.Formatter('%(asctime)s:%(name)s:%(levelname)s:%(message)s') + +file_handler.setFormatter(formatter) +console_handler.setFormatter(formatter) + +logger.addHandler(file_handler) +logger.addHandler(console_handler) diff --git a/pdfgrab.py b/pdfgrab.py index 9ec84b0..41c9e14 100755 --- a/pdfgrab.py +++ b/pdfgrab.py @@ -1,61 +1,35 @@ #!/usr/bin/env python3 ##################### -# yay - old tool adjusted for python3, using googlesearch now -# and not some self crafted f00 -# # new features, new layout, new new :> -# by dash at the end of September 2019 -# -# TODO -# * add complete path in output as well as url where pdf came from -# -> if url not exist like -F mode, then the local path -# * clean up code -# * fine tune google search -# * add random timeout for new requests -# -> maybe not necessary, gs has it ... -# -> sort of necessary, on the other hand use proxychains man -# * uh oh some fancy c0l0rs -# * add thread support -# * add scrape mode, to search for pdfs at the website itself -# * add current error conditions to logfile -# -# Done -# * add url list to output -# * queues added, but no thread support yet -# * json file output -# * txt file output -# * outfilename hardcoded -# * add decryption routine -# * catch ssl exceptions -# * add random useragent for google and website pdf gathering -# * set option for certificate verification, default is true -# * catch conn refused connections -# * catch filename to long thingy +# by dash -import os -import sys +import xml +import argparse import json +import os import queue import urllib -import argparse -import requests +from json2html import * -# remove somewhen ;) -from IPython import embed - -from PyPDF2 import pdf import PyPDF2 -from Crypto.Hash import SHA256 -from collections import deque # googlesearch library import googlesearch as gs +import requests +from PyPDF2 import pdf + +# functions to extern files +from libs.liblog import logger +from libs.libhelper import * +from libs.libgoogle import * + +from IPython import embed # some variables in regard of the tool itself -name = 'pdfgrab' -version = '0.4.4' -author = 'dash' -date = '2019' +name = 'pdfgrab' +version = '0.4.7' +author = 'dash' +date = '2019' # queues for processing # this queue holds the URL locations of files to download @@ -68,76 +42,93 @@ pdf_q = queue.Queue() # this is the analysis queue, keeping the data for further processing ana_q = queue.Queue() -def create_sha256(hdata): - ''' introduced to create hashes of filenames, to have a uniqid - of course hashes of the file itself will be the next topic - ''' - hobject = SHA256.new(data=hdata.encode()) - return (hobject.hexdigest()) - -def process_queue_data(filename,data,queue_type): - ''' main function for processing gathered data - i use this central function for it, so it is at *one* place - and it is easy to change the data handling at a later step without - deconstructing the who code - ''' - ana_dict = {} - url_dict = {} - - if queue_type=='doc_info': - print('[v] Queue DocInfo Data %s' % (filename)) - name = find_name(filename) - path = filename - - # create a hash over the file path - # hm, removed for now - #path_hash = create_sha256(path) - - # order data in dict for analyse queue - ana_dict = {path : {'filename':name,'data':data}} -# print(data) -# print(ana_dict) - - # add the data to queue - add_queue(ana_q,ana_dict) - - elif queue_type=='url': - # prepare queue entry - print('[v] Url Queue %s' % (data)) - url_dict = 
{'url':data,'filename':filename} - sha256=create_sha256(data) - url_d[sha256]=url_dict - - # add dict to queue - add_queue(url_q,url_dict) - - else: - print('[-] Sorry, unknown queue. DEBUG!') - return False - - return True - def add_queue(tqueue, data): - ''' wrapper function for adding easy data to + ''' wrapper function for adding easy data to created queues. otherwise the functions will be scattered with endless queue commands ;) ''' - tqueue.put(data) - #d=tqueue.get() - #print(d) - return True + tqueue.put(data) + # d=tqueue.get() + #logging.debug(d) + return True -def url_strip(url): - url = url.rstrip("\n") - url = url.rstrip("\r") - return url +def process_queue_data(filename, data, queue_type): + ''' main function for processing gathered data + i use this central function for it, so it is at *one* place + and it is easy to change the data handling at a later step without + deconstructing the who code + ''' + ana_dict = {} + url_dict = {} -def get_random_agent(): - return (gs.get_random_user_agent()) + if queue_type == 'doc_info': + logger.info('Queue DocInfo Data {0}'.format(filename)) + name = find_name(filename) + path = filename + + # create a hash over the file path + # hm, removed for now + # path_hash = create_sha256(path) + + # order data in dict for analyse queue + ana_dict = {path: {'filename': name, 'data': data}} + #print('data:',data) + #print('ana_dcit:',ana_dict) + + # add the data to queue + add_queue(ana_q, ana_dict) + + elif queue_type == 'doc_xmp_info': + logger.info('Queue DocXMPInfo Data {0}'.format(filename)) + logger.warning('DocXMPInfo json processing not supported {0}'.format(filename)) + + elif queue_type == 'url': + # prepare queue entry + logger.info('Url Queue {0}'.format(data)) + url_dict = {'url': data, 'filename': filename} + sha256 = create_sha256(data) + url_d[sha256] = url_dict + + # add dict to queue + add_queue(url_q, url_dict) + + else: + print('[-] Sorry, unknown queue. DEBUG!') + logger.critical('Unknown queue') + return False + + return True + +def get_xmp_meta_data(filename, filehandle): + ''' get the xmp meta data + ''' + + err_dict = {} + real_extract = {} + xmp_dict = {} + + fh = filehandle + + try: + xmp_meta = fh.getXmpMetadata() + + except xml.parsers.expat.ExpatError as e: + print('Error: %s' % e) + err_dict = {'error': str(e)} + return -1 + + finally: + process_queue_data(filename, err_dict, 'doc_xmp_info') + + if xmp_meta != None: + print('xmp_meta: {0} {1} {2} {3} {4} {5}'.format(xmp_meta.pdf_producer,xmp_meta.pdf_pdfversion,xmp_meta.dc_contributor,xmp_meta.dc_creator,xmp_meta.dc_date,xmp_meta.dc_subject)) + xmp_dict = {} + + return xmp_dict def get_DocInfo(filename, filehandle): - ''' the easy way to extract metadata + ''' the easy way to extract metadata indirectObjects... 
there is an interesting situation, some pdfs seem to have the same information stored @@ -152,357 +143,346 @@ def get_DocInfo(filename, filehandle): bad example: ''' - err_dict = {} - real_extract = {} + err_dict = {} + real_extract = {} - fh = filehandle + fh = filehandle - try: - extract = fh.documentInfo + try: + extract = fh.documentInfo - except pdf.utils.PdfReadError as e: - print('Error: %s' % e) - err_dict={'error':str(e)} - return -1 + except pdf.utils.PdfReadError as e: + print('Error: %s' % e) + err_dict = {'error': str(e)} + return -1 - except PyPDF2.utils.PdfReadError as e: - print('Error: %s' % e) - err_dict={'error':str(e)} - return -1 + except PyPDF2.utils.PdfReadError as e: + print('Error: %s' % e) + err_dict = {'error': str(e)} + return -1 - finally: - process_queue_data(filename,err_dict,'doc_info') + finally: + process_queue_data(filename, err_dict, 'doc_info') - print('-'*80) - print('File: %s' % filename) -# embed() - # there are situations when documentinfo does not return anything - # and extract is None - if extract==None: - err_dict={'error':'getDocumentInfo() returns None'} - process_queue_data(filename,err_dict,'doc_info') - return -1 + print('-' * 80) + print('File: %s' % filename) + # embed() + # there are situations when documentinfo does not return anything + # and extract is None + if extract == None: + err_dict = {'error': 'getDocumentInfo() returns None'} + process_queue_data(filename, err_dict, 'doc_info') + return -1 + try: + for k in extract.keys(): + key = str(k) + value = str(extract[k]) + edata = '%s %s' % (key, value) + print(edata) + print + real_extract[key] = value + print('-' * 80) - try: - for k in extract.keys(): - key = str(k) - value = str(extract[k]) - edata = '%s %s' % (key,value) - print(edata) - print - real_extract[key]=value - print('-'*80) + except PyPDF2.utils.PdfReadError as e: + print('Error: %s' % e) + err_dict = {'error': str(e)} + process_queue_data(filename, err_dict, 'doc_info') + return -1 - except PyPDF2.utils.PdfReadError as e: - print('Error: %s' % e) - err_dict={'error':str(e)} - process_queue_data(filename,err_dict,'doc_info') - return -1 - - - process_queue_data(filename,real_extract,'doc_info') + process_queue_data(filename, real_extract, 'doc_info') def decrypt_empty_pdf(filename): - ''' this function simply tries to decrypt the pdf with the null password + ''' this function simply tries to decrypt the pdf with the null password this does work, as long as no real password has been set if a complex password has been set -> john ''' - fr = pdf.PdfFileReader(open(filename,"rb")) - try: - fr.decrypt('') + fr = pdf.PdfFileReader(open(filename, "rb")) + try: + fr.decrypt('') + + except NotImplementedError as e: + # print('Error: %s' % (e)) + print('Error: File: %s encrypted. %s' % (filename, str(e))) + return -1 + return fr - except NotImplementedError as e: - #print('Error: %s' % (e)) - print('Error: File: %s encrypted. 
%s' % (filename,str(e))) - return -1 - return fr - def check_encryption(filename): - ''' basic function to check if file is encrypted + ''' basic function to check if file is encrypted ''' -# print(filename) - try: - fr = pdf.PdfFileReader(open(filename,"rb")) - except pdf.utils.PdfReadError as e: - print('Error: %s' % e) - return -1 + # print(filename) + try: + fr = pdf.PdfFileReader(open(filename, "rb")) + except pdf.utils.PdfReadError as e: + print('Error: %s' % e) + return -1 - if fr.getIsEncrypted()==True: - print('[i] File encrypted %s' % filename) - nfr = decrypt_empty_pdf(filename) - if nfr != -1: - get_DocInfo(filename,nfr) + if fr.getIsEncrypted() == True: + print('[i] File encrypted %s' % filename) + nfr = decrypt_empty_pdf(filename) + if nfr != -1: + get_DocInfo(filename, nfr) + get_xmp_meta_data(filename,nfr) - else: - get_DocInfo(filename,fr) + else: + get_DocInfo(filename, fr) + get_xmp_meta_data(filename,fr) - #fr.close() + # fr.close() - return True + return True -def find_name(pdf): - ''' simply parses the urlencoded name and extracts the storage name - i would not be surprised this naive approach can lead to fuckups - ''' - - #find the name of the file - name = pdf.split("/") - a = len(name) - name = name[a-1] - #print(name) - - return name - -def make_directory(outdir): - ''' naive mkdir function ''' - try: - os.mkdir(outdir) - except: - #print("[W] mkdir, some error, directory probably exists") - pass def download_pdf(url, args, header_data): - ''' downloading the pdfile for later analysis ''' + ''' downloading the pdfile for later analysis ''' - # check the remote tls certificate or not? - cert_check = args.cert_check + # check the remote tls certificate or not? + cert_check = args.cert_check - try: - req = requests.get(url,headers=header_data,verify=cert_check) - #req = requests.get(url,headers=header_data,verify=False) - data = req.content - status_code = req.status_code + try: + req = requests.get(url, headers=header_data, verify=cert_check) + # req = requests.get(url,headers=header_data,verify=False) + data = req.content + status_code = req.status_code - except requests.exceptions.SSLError as e: - print('Error: %s' % e) - return -1 + except requests.exceptions.SSLError as e: + print('Error: %s' % e) + return -1 - except: - print('Error: Probably something wrong with remote server') - return -1 + except: + print('Error: Probably something wrong with remote server') + return -1 - if status_code == 403: - print('%s http/403 Forbidden' % (url)) - return -1 + if status_code == 403: + print('%s http/403 Forbidden' % (url)) + return -1 - #print(len(data)) - return data + # print(len(data)) + return data -def store_pdf(url,data,outdir): - ''' storing the downloaded pdf data - ''' - print('[v] store_pdf') - name = find_name(url) - # only allow stored file a name with 50 chars - if len(name)>50: - name = name[:49] + '.pdf' - #print(len(name)) +def store_pdf(url, data, outdir): + ''' storing the downloaded pdf data + ''' - save = "%s/%s" % (outdir,name) + logger.info('Store pdf') + name = find_name(url) - try: - f = open(save,"wb") - except OSError as e: - print('Error: %s' % (e)) - return -1 + # only allow stored file a name with 50 chars + if len(name) > 50: + name = name[:49] + '.pdf' + # print(len(name)) + + save = "%s/%s" % (outdir, name) + + try: + f = open(save, "wb") + except OSError as e: + print('Error: %s' % (e)) + return -1 + + ret = f.write(data) + logger.info('Written {0} bytes for file: {1}'.format(ret,save)) + f.close() + + # return the savepath + return save 
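# --- illustrative sketch (editor's example, not part of this patch) ---
# store_pdf() above truncates long names to 50 characters, and libs/libhelper.py
# already provides create_sha256() to derive a unique id from a filename. A
# minimal, hypothetical combination of the two -- safe_save_name() is not a
# function in this patch -- could keep truncated names collision-free:

import os
from Crypto.Hash import SHA256  # same dependency libs/libhelper.py already uses

def create_sha256(hdata):
    # identical to the helper in libs/libhelper.py
    hobject = SHA256.new(data=hdata.encode())
    return hobject.hexdigest()

def safe_save_name(url, outdir, max_len=50):
    # same naive split as find_name(), plus a short URL hash when truncating
    name = url.split("/")[-1]
    if len(name) > max_len:
        name = '%s_%s.pdf' % (name[:20], create_sha256(url)[:12])
    return os.path.join(outdir, name)

# e.g. safe_save_name('https://example.org/a-very-long-annual-report-name-2019-edition.pdf', 'pdfgrab')
# --- end of sketch ---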
- ret=f.write(data) - print('[+] Written %d bytes for File: %s' % (ret,save)) - f.close() - - # return the savepath - return save def _parse_pdf(filename): - ''' the real parsing function ''' + ''' the real parsing function ''' + + ret = check_encryption(filename) + return ret - ret = check_encryption(filename) - return ret def grab_url(url, args, outdir): - ''' function keeping all the steps for the user call of grabbing - just one pdf and analysing it - ''' - header_data={'User-Agent':get_random_agent()} - data = download_pdf(url,args, header_data) - if data != -1: - savepath = store_pdf(url, data, outdir) - _parse_pdf(savepath) + ''' function keeping all the steps for the user call of grabbing + just one pdf and analysing it + ''' + header_data = {'User-Agent': get_random_agent()} + data = download_pdf(url, args, header_data) + if data != -1: + savepath = store_pdf(url, data, outdir) + _parse_pdf(savepath) - return - -def seek_and_analyse(search,args,outdir): - ''' function for keeping all the steps of searching for pdfs and analysing - them together - ''' - # use the search function of googlesearch to get the results - search_pdf(search,args) - #urls = search_pdf(search,args) + return - # *if* we get an answer - if url_q.empty()==False: - #if urls != -1: - # process through the list and get the pdfs - while url_q.empty()==False: - item=url_q.get() - #print(item) - url = item['url'] - grab_url(url,args,outdir) +def seek_and_analyse(search, args, outdir): + ''' function for keeping all the steps of searching for pdfs and analysing + them together + ''' + # use the search function of googlesearch to get the results + urls=search_pdf(search, args) + for item in urls: + filename = find_name(item) + process_queue_data(filename, item, 'url') -def search_pdf(search, args): - ''' the function where googlesearch from mario vilas - is called - ''' + # urls = search_pdf(search,args) - search_stop = args.search_stop + # *if* we get an answer + if url_q.empty() == False: + # if urls != -1: + # process through the list and get the pdfs + while url_q.empty() == False: + item = url_q.get() + # print(item) + url = item['url'] + grab_url(url, args, outdir) - query='%s filetype:pdf' % search - #print(query) - urls = [] - try: - for url in gs.search(query,num=20,stop=search_stop,user_agent=gs.get_random_user_agent()): - #print(url) - # parse out the name of the file in the url - filename=find_name(url) - # add the file to queue - process_queue_data(filename,url,'url') - urls.append(url) - - except urllib.error.HTTPError as e: - print('Error: %s' % e) - return -1 - #return urls def run(args): - # outfile name - if args.outfile: - out_filename = args.outfile - else: - out_filename = 'pdfgrab_analysis' + # initialize logger + logger.info('{0} Started'.format(name)) - # specify output directory - outdir = args.outdir + # outfile name + if args.outfile: + out_filename = args.outfile + else: + out_filename = 'pdfgrab_analysis' - # create output directory - make_directory(outdir) + # specify output directory + outdir = args.outdir - # lets see what the object is - if args.url_single: - url = args.url_single - print('[+] Grabbing %s' % (url)) - grab_url(url, args,outdir) + # create output directory + make_directory(outdir) - elif args.file_single: - pdffile = args.file_single - print('[+] Parsing %s' % (pdffile)) - _parse_pdf(pdffile) + # lets see what the object is + if args.url_single: + url = args.url_single + logger.info('Grabbing {0}'.format(url)) + logger.write_to_log('Grabbing %s' % (url)) + grab_url(url, 
args, outdir) - elif args.search: - search = args.search - #print(args) - print('[+] Seek and de...erm...analysing %s' % (search)) - seek_and_analyse(search,args,outdir) - - elif args.files_dir: - directory = args.files_dir - print('[+] Analyse pdfs in directory %s' % (directory)) - try: - files = os.listdir(directory) - except: - print('Error') - return False + elif args.file_single: + pdffile = args.file_single + logger.info('Parsing {0}'.format(pdffile)) + _parse_pdf(pdffile) - for f in files: - # naive filter function, later usage of filemagic possible - if f.find('.pdf')!=-1: - fpath = '%s/%s' % (directory,f) - _parse_pdf(fpath) + elif args.search: + search = args.search + logger.info('Seek and analyse {0}'.format(search)) + seek_and_analyse(search, args, outdir) - else: - print('[-] Dunno what to do, bro.') - - # move analysis dictionary in queue back to dictionary - analysis_dict = {} - while ana_q.empty()==False: - item = ana_q.get() - #print('item ', item) - analysis_dict.update(item) - - # ana_q is empty now + elif args.files_dir: + directory = args.files_dir + logger.info('Analyse pdfs in directory {0}'.format(directory)) + try: + files = os.listdir(directory) + except: + logger.warning('Error in args.files_dir') + return False - # create txt output - sep = '-'*80 + '\n' - txtout = "%s/%s.txt" % (outdir,out_filename) - fwtxt = open(txtout,'w') - #print(analysis_dict) - for k in analysis_dict.keys(): - fwtxt.write(sep) - fname = 'File: %s\n' % (analysis_dict[k]['filename']) - ddata = analysis_dict[k]['data'] - fwtxt.write(fname) - for kdata in ddata.keys(): - metatxt = '%s:%s\n' % (kdata,ddata[kdata]) - fwtxt.write(metatxt) - fwtxt.write(sep) - fwtxt.close() - - # create json output - jsonout = "%s/%s.json" % (outdir,out_filename) - fwjson = open(jsonout,'w') - #for k in analysis_dict.keys(): - #print(analysis_dict[k]) - # jdata = json.dumps(analysis_dict[k]) + for f in files: + # naive filter function, later usage of filemagic possible + if f.find('.pdf') != -1: + fpath = '%s/%s' % (directory, f) + _parse_pdf(fpath) - #print(analysis_dict) - jdata = json.dumps(analysis_dict) - fwjson.write(jdata) - fwjson.close() + else: + print('[-] Dunno what to do, bro. Use help. 
{0} -h'.format(sys.argv[0])) - # create url savefile - #print('url_d: ', url_d) - jsonurlout = "%s/%s_url.json" % (outdir,out_filename) - fwjson = open(jsonurlout,'w') - jdata = json.dumps(url_d) - fwjson.write(jdata) - fwjson.close() + # move analysis dictionary in queue back to dictionary + analysis_dict = {} + while ana_q.empty() == False: + item = ana_q.get() + # print('item ', item) + analysis_dict.update(item) + + #print('dict:',analysis_dict) + # ana_q is empty now + + # create txt output + sep = '-' * 80 + '\n' + txtout = "%s/%s.txt" % (outdir, out_filename) + fwtxt = open(txtout, 'w') + # print(analysis_dict) + for k in analysis_dict.keys(): + fwtxt.write(sep) + fname = 'File: %s\n' % (analysis_dict[k]['filename']) + ddata = analysis_dict[k]['data'] + fwtxt.write(fname) + for kdata in ddata.keys(): + metatxt = '%s:%s\n' % (kdata, ddata[kdata]) + fwtxt.write(metatxt) + fwtxt.write(sep) + fwtxt.close() + + # create json output + jsonout = "%s/%s.json" % (outdir, out_filename) + fwjson = open(jsonout, 'w') + + # print(analysis_dict) + jdata = json.dumps(analysis_dict) + fwjson.write(jdata) + fwjson.close() + + # create html from json + htmlout = "%s/%s.html" % (outdir, out_filename) + fwhtml = open(htmlout,'w') + #print(jdata) + html = json2html.convert(json = jdata) + fwhtml.write(html) + fwhtml.close() + + + # create url savefile + # print('url_d: ', url_d) + jsonurlout = "%s/%s_url.json" % (outdir, out_filename) + fwjson = open(jsonurlout, 'w') + jdata = json.dumps(url_d) + fwjson.write(jdata) + fwjson.close() + + txtout = "%s/%s_url.txt" % (outdir, out_filename) + fwtxt = open(txtout, 'w') + for k in url_d.keys(): + ddata = url_d[k] + metatxt = '%s:%s\n' % (ddata['url'], ddata['filename']) + fwtxt.write(metatxt) + fwtxt.close() + + return 42 - txtout = "%s/%s_url.txt" % (outdir,out_filename) - fwtxt = open(txtout,'w') - for k in url_d.keys(): - ddata = url_d[k] - metatxt = '%s:%s\n' % (ddata['url'], ddata['filename']) - fwtxt.write(metatxt) - fwtxt.close() - - return 42 - # This is the end my friend. +# This is the end my friend. 
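# --- illustrative sketch (editor's example, not part of this patch) ---
# the HTML report added in 0.4.7 is produced by feeding the same JSON string
# that backs <outfile>.json through json2html.convert(), exactly as run() does
# above; the analysis_dict sample below is made up for illustration

import json
from json2html import json2html

analysis_dict = {
    'pdfgrab/example.pdf': {
        'filename': 'example.pdf',
        'data': {'/Producer': 'LaTeX with hyperref', '/Author': 'jane'},
    }
}

jdata = json.dumps(analysis_dict)
html = json2html.convert(json=jdata)

with open('pdfgrab_analysis.html', 'w') as fwhtml:
    fwhtml.write(html)
# --- end of sketch ---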
def main(): - parser_desc = "%s %s %s in %s" % (name,version,author,date) - parser = argparse.ArgumentParser(prog = name, description=parser_desc) - parser.add_argument('-O','--outdir',action='store',dest='outdir',required=False,help="define the outdirectory for downloaded files and analysis output",default='pdfgrab') - parser.add_argument('-o','--outfile',action='store',dest='outfile',required=False,help="define file with analysis output, if no parameter given it is outdir/pdfgrab_analysis, please note outfile is *always* written to output directory so do not add the dir as extra path") - parser.add_argument('-u','--url',action='store',dest='url_single',required=False,help="grab pdf from specified url for analysis",default=None) - #parser.add_argument('-U','--url-list',action='store',dest='urls_many',required=False,help="specify txt file with list of pdf urls to grab",default=None) -######### - parser.add_argument('-f','--file',action='store',dest='file_single',required=False,help="specify local path of pdf for analysis",default=None) - parser.add_argument('-F','--files-dir',action='store',dest='files_dir',required=False,help="specify local path of *directory* with pdf *files* for analysis",default=None) - parser.add_argument('-s','--search',action='store',dest='search',required=False,help="specify domain or tld to scrape for pdf-files",default=None) - parser.add_argument('-sn','--search-number',action='store',dest='search_stop',required=False,help="specify how many files are searched",default=10,type=int) - parser.add_argument('-z','--disable-cert-check',action='store_false',dest='cert_check',required=False,help="if the target domain(s) run with old or bad certificates",default=True) + parser_desc = "%s %s %s in %s" % (name, version, author, date) + parser = argparse.ArgumentParser(prog=name, description=parser_desc) + parser.add_argument('-O', '--outdir', action='store', dest='outdir', required=False, + help="define the outdirectory for downloaded files and analysis output", default='pdfgrab') + parser.add_argument('-o', '--outfile', action='store', dest='outfile', required=False, + help="define file with analysis output, if no parameter given it is outdir/pdfgrab_analysis, please note outfile is *always* written to output directory so do not add the dir as extra path") + parser.add_argument('-u', '--url', action='store', dest='url_single', required=False, + help="grab pdf from specified url for analysis", default=None) + # parser.add_argument('-U','--url-list',action='store',dest='urls_many',required=False,help="specify txt file with list of pdf urls to grab",default=None) + ######### + parser.add_argument('-f', '--file', action='store', dest='file_single', required=False, + help="specify local path of pdf for analysis", default=None) + parser.add_argument('-F', '--files-dir', action='store', dest='files_dir', required=False, + help="specify local path of *directory* with pdf *files* for analysis", default=None) + parser.add_argument('-s', '--search', action='store', dest='search', required=False, + help="specify domain or tld to scrape for pdf-files", default=None) + parser.add_argument('-sn', '--search-number', action='store', dest='search_stop', required=False, + help="specify how many files are searched", default=10, type=int) + parser.add_argument('-z', '--disable-cert-check', action='store_false', dest='cert_check', required=False, + help="if the target domain(s) run with old or bad certificates", default=True) + + if len(sys.argv)<2: + parser.print_help(sys.stderr) + sys.exit() 
+
+    args = parser.parse_args()
+    run(args)
-    args = parser.parse_args()
-    run(args)
 if __name__ == "__main__":
-    main()
+    main()
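For reference, a minimal sketch (editor's example, not shipped in this patch) of how the
new libs/libgoogle.py helper is called from outside pdfgrab.py. search_pdf() returns a
list of result URLs, or -1 when googlesearch raises an HTTPError, so callers should check
for that sentinel before iterating. The Args class below is only a stand-in for the
argparse namespace and mimics the single attribute the helper reads.

    from libs.libgoogle import search_pdf

    class Args:
        # stand-in for the argparse namespace; search_pdf() only reads search_stop
        search_stop = 10

    urls = search_pdf('example.org', Args())

    if urls == -1:
        # search_pdf() returns -1 when the query hits an urllib HTTPError
        print('search failed')
    else:
        for url in urls:
            print(url)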