release of version 0.4.7: added HTML reporting, added logging, reordered code into libraries, added experimental XMP metadata support, fixed a bug introduced by the XMP metadata change, added a todo list

docs/Changelog (new file, 20 lines)

Changelog
=========

Version 4.7
-----------

* added html out
* added xmp meta testing

Version 4.6
-----------

* added help for non-argument given at cli
* added googlesearch lib

Version 4.5
-----------

* exported helper functions to libs/helper.py
* added libs/liblog.py

docs/Todo (new file, 4 lines)

* add xmp meta to output files
* code reordering
* clean up parsing functions
* add report formats

libs/__init__.py (new file, empty)

libs/libgoogle.py (new file, 30 lines)

import googlesearch as gs
import urllib
from libs.libhelper import *

def get_random_agent():
    return (gs.get_random_user_agent())

def search_pdf(search, args):
    ''' the function where googlesearch from mario vilas
    is called
    '''

    search_stop = args.search_stop

    query = '%s filetype:pdf' % search
    # print(query)
    urls = []

    try:
        for url in gs.search(query, num=20, stop=search_stop, user_agent=gs.get_random_user_agent()):
            #print(url)
            urls.append(url)

    except urllib.error.HTTPError as e:
        print('Error: %s' % e)
        return -1


    return urls
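
Note: the following is a minimal usage sketch of libs/libgoogle.py, not part of the commit. It assumes the googlesearch package by Mario Vilas is installed, network access is available, and the snippet is run from the repository root; the domain and stop count are made up for illustration.

# hypothetical driver for search_pdf(); pdfgrab.py itself passes its argparse result as args
import argparse

from libs.libgoogle import search_pdf

# mimic the namespace pdfgrab builds (-sn / --search-number fills search_stop)
args = argparse.Namespace(search_stop=5)

urls = search_pdf('example.com', args)   # returns a list of urls, or -1 on HTTPError
if urls != -1:
    for url in urls:
        print(url)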

libs/libhelper.py (new file, 37 lines)

import os
import sys
from Crypto.Hash import SHA256

def make_directory(outdir):
    ''' naive mkdir function '''
    try:
        os.mkdir(outdir)
    except:
        # print("[W] mkdir, some error, directory probably exists")
        pass

def url_strip(url):
    url = url.rstrip("\n")
    url = url.rstrip("\r")
    return url

def create_sha256(hdata):
    ''' introduced to create hashes of filenames, to have a uniqid
    of course hashes of the file itself will be the next topic
    '''
    hobject = SHA256.new(data=hdata.encode())
    return (hobject.hexdigest())

def find_name(pdf):
    ''' simply parses the urlencoded name and extracts the storage name
    i would not be surprised this naive approach can lead to fuckups
    '''

    # find the name of the file
    name = pdf.split("/")
    a = len(name)
    name = name[a - 1]
    # print(name)

    return name
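
Note: a quick sketch of the helpers above, not part of the commit; the URL is made up, and Crypto.Hash requires pycryptodome (or pycrypto) to be installed.

# hypothetical example of the libs/libhelper.py functions
from libs.libhelper import url_strip, find_name, create_sha256

url = url_strip('https://example.com/docs/report.pdf\r\n')
print(find_name(url))      # 'report.pdf' - last component of the url path
print(create_sha256(url))  # sha256 hex digest, used by pdfgrab as a unique id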

libs/liblog.py (new file, 17 lines)

import logging

logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)

file_handler = logging.FileHandler('pdfgrab.log')

console_handler = logging.StreamHandler()
console_handler.setLevel(logging.WARNING)

formatter = logging.Formatter('%(asctime)s:%(name)s:%(levelname)s:%(message)s')

file_handler.setFormatter(formatter)
console_handler.setFormatter(formatter)

logger.addHandler(file_handler)
logger.addHandler(console_handler)
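
Note: a minimal sketch of how the shared logger above is consumed, not part of the commit; pdfgrab.py does the same import in the diff below. The message strings are illustrative.

# hypothetical usage of the module-level logger from libs/liblog.py
from libs.liblog import logger

logger.info('pdfgrab started')                 # DEBUG and above end up in pdfgrab.log
logger.warning('certificate check disabled')   # WARNING and above also reach the console (stderr)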

pdfgrab.py (modified, 742 lines changed)

@@ -1,61 +1,35 @@
 #!/usr/bin/env python3
 #####################
-# yay - old tool adjusted for python3, using googlesearch now
-# and not some self crafted f00
-#
 # new features, new layout, new new :>
-# by dash at the end of September 2019
-#
-# TODO
-# * add complete path in output as well as url where pdf came from
-#   -> if url not exist like -F mode, then the local path
-# * clean up code
-# * fine tune google search
-# * add random timeout for new requests
-#   -> maybe not necessary, gs has it ...
-#   -> sort of necessary, on the other hand use proxychains man
-# * uh oh some fancy c0l0rs
-# * add thread support
-# * add scrape mode, to search for pdfs at the website itself
-# * add current error conditions to logfile
-#
-# Done
-# * add url list to output
-# * queues added, but no thread support yet
-# * json file output
-# * txt file output
-# * outfilename hardcoded
-# * add decryption routine
-# * catch ssl exceptions
-# * add random useragent for google and website pdf gathering
-# * set option for certificate verification, default is true
-# * catch conn refused connections
-# * catch filename to long thingy
+# by dash
 
-import os
-import sys
+import xml
+import argparse
 import json
+import os
 import queue
 import urllib
-import argparse
-import requests
-
-# remove somewhen ;)
-from IPython import embed
-
-from PyPDF2 import pdf
+from json2html import *
 import PyPDF2
-from Crypto.Hash import SHA256
-from collections import deque
 
 # googlesearch library
 import googlesearch as gs
+import requests
+from PyPDF2 import pdf
+
+# functions to extern files
+from libs.liblog import logger
+from libs.libhelper import *
+from libs.libgoogle import *
+
+from IPython import embed
 
 # some variables in regard of the tool itself
 name = 'pdfgrab'
-version = '0.4.4'
+version = '0.4.7'
 author = 'dash'
 date = '2019'
 
 # queues for processing
 # this queue holds the URL locations of files to download
@@ -68,76 +42,93 @@ pdf_q = queue.Queue()
 # this is the analysis queue, keeping the data for further processing
 ana_q = queue.Queue()
 
-def create_sha256(hdata):
-    ''' introduced to create hashes of filenames, to have a uniqid
-    of course hashes of the file itself will be the next topic
-    '''
-    hobject = SHA256.new(data=hdata.encode())
-    return (hobject.hexdigest())
-
-def process_queue_data(filename,data,queue_type):
-    ''' main function for processing gathered data
-    i use this central function for it, so it is at *one* place
-    and it is easy to change the data handling at a later step without
-    deconstructing the who code
-    '''
-    ana_dict = {}
-    url_dict = {}
-
-    if queue_type=='doc_info':
-        print('[v] Queue DocInfo Data %s' % (filename))
-        name = find_name(filename)
-        path = filename
-
-        # create a hash over the file path
-        # hm, removed for now
-        #path_hash = create_sha256(path)
-
-        # order data in dict for analyse queue
-        ana_dict = {path : {'filename':name,'data':data}}
-        # print(data)
-        # print(ana_dict)
-
-        # add the data to queue
-        add_queue(ana_q,ana_dict)
-
-    elif queue_type=='url':
-        # prepare queue entry
-        print('[v] Url Queue %s' % (data))
-        url_dict = {'url':data,'filename':filename}
-        sha256=create_sha256(data)
-        url_d[sha256]=url_dict
-
-        # add dict to queue
-        add_queue(url_q,url_dict)
-
-    else:
-        print('[-] Sorry, unknown queue. DEBUG!')
-        return False
-
-    return True
-
 def add_queue(tqueue, data):
     ''' wrapper function for adding easy data to
     created queues. otherwise the functions will be scattered with
     endless queue commands ;)
     '''
 
     tqueue.put(data)
-    #d=tqueue.get()
-    #print(d)
+    # d=tqueue.get()
+    #logging.debug(d)
     return True
 
-def url_strip(url):
-    url = url.rstrip("\n")
-    url = url.rstrip("\r")
-    return url
-
-
-def get_random_agent():
-    return (gs.get_random_user_agent())
-
+def process_queue_data(filename, data, queue_type):
+    ''' main function for processing gathered data
+    i use this central function for it, so it is at *one* place
+    and it is easy to change the data handling at a later step without
+    deconstructing the who code
+    '''
+    ana_dict = {}
+    url_dict = {}
+
+    if queue_type == 'doc_info':
+        logger.info('Queue DocInfo Data {0}'.format(filename))
+        name = find_name(filename)
+        path = filename
+
+        # create a hash over the file path
+        # hm, removed for now
+        # path_hash = create_sha256(path)
+
+        # order data in dict for analyse queue
+        ana_dict = {path: {'filename': name, 'data': data}}
+        #print('data:',data)
+        #print('ana_dcit:',ana_dict)
+
+        # add the data to queue
+        add_queue(ana_q, ana_dict)
+
+    elif queue_type == 'doc_xmp_info':
+        logger.info('Queue DocXMPInfo Data {0}'.format(filename))
+        logger.warning('DocXMPInfo json processing not supported {0}'.format(filename))
+
+    elif queue_type == 'url':
+        # prepare queue entry
+        logger.info('Url Queue {0}'.format(data))
+        url_dict = {'url': data, 'filename': filename}
+        sha256 = create_sha256(data)
+        url_d[sha256] = url_dict
+
+        # add dict to queue
+        add_queue(url_q, url_dict)
+
+    else:
+        print('[-] Sorry, unknown queue. DEBUG!')
+        logger.critical('Unknown queue')
+        return False
+
+    return True
+
+def get_xmp_meta_data(filename, filehandle):
+    ''' get the xmp meta data
+    '''
+
+    err_dict = {}
+    real_extract = {}
+    xmp_dict = {}
+
+    fh = filehandle
+
+    try:
+        xmp_meta = fh.getXmpMetadata()
+
+    except xml.parsers.expat.ExpatError as e:
+        print('Error: %s' % e)
+        err_dict = {'error': str(e)}
+        return -1
+
+    finally:
+        process_queue_data(filename, err_dict, 'doc_xmp_info')
+
+    if xmp_meta != None:
+        print('xmp_meta: {0} {1} {2} {3} {4} {5}'.format(xmp_meta.pdf_producer,xmp_meta.pdf_pdfversion,xmp_meta.dc_contributor,xmp_meta.dc_creator,xmp_meta.dc_date,xmp_meta.dc_subject))
+        xmp_dict = {}
+
+    return xmp_dict
+
 def get_DocInfo(filename, filehandle):
     ''' the easy way to extract metadata
 
     indirectObjects...
     there is an interesting situation, some pdfs seem to have the same information stored
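
Note: a stand-alone sketch of the experimental XMP path introduced in get_xmp_meta_data() above, not part of the commit. It assumes PyPDF2 1.x (pdf.PdfFileReader / getXmpMetadata); 'some.pdf' is a placeholder filename.

# hypothetical stand-alone version of the xmp extraction step
from PyPDF2 import pdf

fr = pdf.PdfFileReader(open('some.pdf', 'rb'))
xmp_meta = fr.getXmpMetadata()
if xmp_meta != None:
    # same fields the new code prints: producer, pdf version, dublin core entries
    print(xmp_meta.pdf_producer, xmp_meta.pdf_pdfversion, xmp_meta.dc_creator, xmp_meta.dc_date)
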
@@ -152,357 +143,346 @@ def get_DocInfo(filename, filehandle):
     bad example:
     '''
 
     err_dict = {}
     real_extract = {}
 
     fh = filehandle
 
     try:
         extract = fh.documentInfo
 
     except pdf.utils.PdfReadError as e:
         print('Error: %s' % e)
-        err_dict={'error':str(e)}
+        err_dict = {'error': str(e)}
         return -1
 
     except PyPDF2.utils.PdfReadError as e:
         print('Error: %s' % e)
-        err_dict={'error':str(e)}
+        err_dict = {'error': str(e)}
         return -1
 
     finally:
-        process_queue_data(filename,err_dict,'doc_info')
+        process_queue_data(filename, err_dict, 'doc_info')
 
-    print('-'*80)
+    print('-' * 80)
     print('File: %s' % filename)
     # embed()
     # there are situations when documentinfo does not return anything
     # and extract is None
-    if extract==None:
-        err_dict={'error':'getDocumentInfo() returns None'}
-        process_queue_data(filename,err_dict,'doc_info')
+    if extract == None:
+        err_dict = {'error': 'getDocumentInfo() returns None'}
+        process_queue_data(filename, err_dict, 'doc_info')
         return -1
 
-    try:
-        for k in extract.keys():
-            key = str(k)
-            value = str(extract[k])
-            edata = '%s %s' % (key,value)
-            print(edata)
-            print
-            real_extract[key]=value
-        print('-'*80)
-
-    except PyPDF2.utils.PdfReadError as e:
-        print('Error: %s' % e)
-        err_dict={'error':str(e)}
-        process_queue_data(filename,err_dict,'doc_info')
-        return -1
-
-
-    process_queue_data(filename,real_extract,'doc_info')
+    try:
+        for k in extract.keys():
+            key = str(k)
+            value = str(extract[k])
+            edata = '%s %s' % (key, value)
+            print(edata)
+            print
+            real_extract[key] = value
+        print('-' * 80)
+
+    except PyPDF2.utils.PdfReadError as e:
+        print('Error: %s' % e)
+        err_dict = {'error': str(e)}
+        process_queue_data(filename, err_dict, 'doc_info')
+        return -1
+
+    process_queue_data(filename, real_extract, 'doc_info')
 
 
 def decrypt_empty_pdf(filename):
     ''' this function simply tries to decrypt the pdf with the null password
     this does work, as long as no real password has been set
     if a complex password has been set -> john
     '''
 
-    fr = pdf.PdfFileReader(open(filename,"rb"))
+    fr = pdf.PdfFileReader(open(filename, "rb"))
     try:
         fr.decrypt('')
 
     except NotImplementedError as e:
-        #print('Error: %s' % (e))
-        print('Error: File: %s encrypted. %s' % (filename,str(e)))
+        # print('Error: %s' % (e))
+        print('Error: File: %s encrypted. %s' % (filename, str(e)))
         return -1
     return fr
 
 
 def check_encryption(filename):
     ''' basic function to check if file is encrypted
     '''
 
     # print(filename)
     try:
-        fr = pdf.PdfFileReader(open(filename,"rb"))
+        fr = pdf.PdfFileReader(open(filename, "rb"))
     except pdf.utils.PdfReadError as e:
         print('Error: %s' % e)
         return -1
 
-    if fr.getIsEncrypted()==True:
+    if fr.getIsEncrypted() == True:
         print('[i] File encrypted %s' % filename)
         nfr = decrypt_empty_pdf(filename)
         if nfr != -1:
-            get_DocInfo(filename,nfr)
+            get_DocInfo(filename, nfr)
+            get_xmp_meta_data(filename,nfr)
 
     else:
-        get_DocInfo(filename,fr)
+        get_DocInfo(filename, fr)
+        get_xmp_meta_data(filename,fr)
 
-    #fr.close()
+    # fr.close()
 
     return True
 
-def find_name(pdf):
-    ''' simply parses the urlencoded name and extracts the storage name
-    i would not be surprised this naive approach can lead to fuckups
-    '''
-
-    #find the name of the file
-    name = pdf.split("/")
-    a = len(name)
-    name = name[a-1]
-    #print(name)
-
-    return name
-
-def make_directory(outdir):
-    ''' naive mkdir function '''
-    try:
-        os.mkdir(outdir)
-    except:
-        #print("[W] mkdir, some error, directory probably exists")
-        pass
-
 def download_pdf(url, args, header_data):
     ''' downloading the pdfile for later analysis '''
 
     # check the remote tls certificate or not?
     cert_check = args.cert_check
 
     try:
-        req = requests.get(url,headers=header_data,verify=cert_check)
-        #req = requests.get(url,headers=header_data,verify=False)
+        req = requests.get(url, headers=header_data, verify=cert_check)
+        # req = requests.get(url,headers=header_data,verify=False)
         data = req.content
         status_code = req.status_code
 
     except requests.exceptions.SSLError as e:
         print('Error: %s' % e)
         return -1
 
     except:
         print('Error: Probably something wrong with remote server')
         return -1
 
     if status_code == 403:
         print('%s http/403 Forbidden' % (url))
         return -1
 
-    #print(len(data))
+    # print(len(data))
     return data
 
-def store_pdf(url,data,outdir):
-    ''' storing the downloaded pdf data
-    '''
-    print('[v] store_pdf')
-    name = find_name(url)
-
-    # only allow stored file a name with 50 chars
-    if len(name)>50:
-        name = name[:49] + '.pdf'
-    #print(len(name))
-
-    save = "%s/%s" % (outdir,name)
-
-    try:
-        f = open(save,"wb")
-    except OSError as e:
-        print('Error: %s' % (e))
-        return -1
-
-    ret=f.write(data)
-    print('[+] Written %d bytes for File: %s' % (ret,save))
-    f.close()
-
-    # return the savepath
-    return save
-
+def store_pdf(url, data, outdir):
+    ''' storing the downloaded pdf data
+    '''
+
+    logger.info('Store pdf')
+    name = find_name(url)
+
+    # only allow stored file a name with 50 chars
+    if len(name) > 50:
+        name = name[:49] + '.pdf'
+    # print(len(name))
+
+    save = "%s/%s" % (outdir, name)
+
+    try:
+        f = open(save, "wb")
+    except OSError as e:
+        print('Error: %s' % (e))
+        return -1
+
+    ret = f.write(data)
+    logger.info('Written {0} bytes for file: {1}'.format(ret,save))
+    f.close()
+
+    # return the savepath
+    return save
 
 def _parse_pdf(filename):
     ''' the real parsing function '''
 
-    ret = check_encryption(filename)
-    return ret
+    ret = check_encryption(filename)
+    return ret
 
 def grab_url(url, args, outdir):
     ''' function keeping all the steps for the user call of grabbing
     just one pdf and analysing it
     '''
-    header_data={'User-Agent':get_random_agent()}
-    data = download_pdf(url,args, header_data)
+    header_data = {'User-Agent': get_random_agent()}
+    data = download_pdf(url, args, header_data)
     if data != -1:
         savepath = store_pdf(url, data, outdir)
         _parse_pdf(savepath)
 
     return
 
-def seek_and_analyse(search,args,outdir):
-    ''' function for keeping all the steps of searching for pdfs and analysing
-    them together
-    '''
-    # use the search function of googlesearch to get the results
-    search_pdf(search,args)
-    #urls = search_pdf(search,args)
-
-
-    # *if* we get an answer
-    if url_q.empty()==False:
-    #if urls != -1:
-        # process through the list and get the pdfs
-        while url_q.empty()==False:
-            item=url_q.get()
-            #print(item)
-            url = item['url']
-            grab_url(url,args,outdir)
-
-def search_pdf(search, args):
-    ''' the function where googlesearch from mario vilas
-    is called
-    '''
-
-    search_stop = args.search_stop
-
-    query='%s filetype:pdf' % search
-    #print(query)
-    urls = []
-
-    try:
-        for url in gs.search(query,num=20,stop=search_stop,user_agent=gs.get_random_user_agent()):
-            #print(url)
-            # parse out the name of the file in the url
-            filename=find_name(url)
-            # add the file to queue
-            process_queue_data(filename,url,'url')
-            urls.append(url)
-
-    except urllib.error.HTTPError as e:
-        print('Error: %s' % e)
-        return -1
-    #return urls
-
+def seek_and_analyse(search, args, outdir):
+    ''' function for keeping all the steps of searching for pdfs and analysing
+    them together
+    '''
+    # use the search function of googlesearch to get the results
+    urls=search_pdf(search, args)
+    for item in urls:
+        filename = find_name(item)
+        process_queue_data(filename, item, 'url')
+    # urls = search_pdf(search,args)
+
+    # *if* we get an answer
+    if url_q.empty() == False:
+    # if urls != -1:
+        # process through the list and get the pdfs
+        while url_q.empty() == False:
+            item = url_q.get()
+            # print(item)
+            url = item['url']
+            grab_url(url, args, outdir)
+
 def run(args):
 
-    # outfile name
-    if args.outfile:
-        out_filename = args.outfile
-    else:
-        out_filename = 'pdfgrab_analysis'
-
-    # specify output directory
-    outdir = args.outdir
-
-    # create output directory
-    make_directory(outdir)
-
-    # lets see what the object is
-    if args.url_single:
-        url = args.url_single
-        print('[+] Grabbing %s' % (url))
-        grab_url(url, args,outdir)
-
-    elif args.file_single:
-        pdffile = args.file_single
-        print('[+] Parsing %s' % (pdffile))
-        _parse_pdf(pdffile)
-
-    elif args.search:
-        search = args.search
-        #print(args)
-        print('[+] Seek and de...erm...analysing %s' % (search))
-        seek_and_analyse(search,args,outdir)
-
-    elif args.files_dir:
-        directory = args.files_dir
-        print('[+] Analyse pdfs in directory %s' % (directory))
-        try:
-            files = os.listdir(directory)
-        except:
-            print('Error')
-            return False
-
-        for f in files:
-            # naive filter function, later usage of filemagic possible
-            if f.find('.pdf')!=-1:
-                fpath = '%s/%s' % (directory,f)
-                _parse_pdf(fpath)
-
-    else:
-        print('[-] Dunno what to do, bro.')
-
-    # move analysis dictionary in queue back to dictionary
-    analysis_dict = {}
-    while ana_q.empty()==False:
-        item = ana_q.get()
-        #print('item ', item)
-        analysis_dict.update(item)
-
-    # ana_q is empty now
-
-    # create txt output
-    sep = '-'*80 + '\n'
-    txtout = "%s/%s.txt" % (outdir,out_filename)
-    fwtxt = open(txtout,'w')
-    #print(analysis_dict)
-    for k in analysis_dict.keys():
-        fwtxt.write(sep)
-        fname = 'File: %s\n' % (analysis_dict[k]['filename'])
-        ddata = analysis_dict[k]['data']
-        fwtxt.write(fname)
-        for kdata in ddata.keys():
-            metatxt = '%s:%s\n' % (kdata,ddata[kdata])
-            fwtxt.write(metatxt)
-        fwtxt.write(sep)
-    fwtxt.close()
-
-    # create json output
-    jsonout = "%s/%s.json" % (outdir,out_filename)
-    fwjson = open(jsonout,'w')
-    #for k in analysis_dict.keys():
-        #print(analysis_dict[k])
-        # jdata = json.dumps(analysis_dict[k])
-
-    #print(analysis_dict)
-    jdata = json.dumps(analysis_dict)
-    fwjson.write(jdata)
-    fwjson.close()
-
-    # create url savefile
-    #print('url_d: ', url_d)
-    jsonurlout = "%s/%s_url.json" % (outdir,out_filename)
-    fwjson = open(jsonurlout,'w')
-    jdata = json.dumps(url_d)
-    fwjson.write(jdata)
-    fwjson.close()
-
-    txtout = "%s/%s_url.txt" % (outdir,out_filename)
-    fwtxt = open(txtout,'w')
-    for k in url_d.keys():
-        ddata = url_d[k]
-        metatxt = '%s:%s\n' % (ddata['url'], ddata['filename'])
-        fwtxt.write(metatxt)
-    fwtxt.close()
-
-    return 42
-    # This is the end my friend.
+    # initialize logger
+    logger.info('{0} Started'.format(name))
+
+    # outfile name
+    if args.outfile:
+        out_filename = args.outfile
+    else:
+        out_filename = 'pdfgrab_analysis'
+
+    # specify output directory
+    outdir = args.outdir
+
+    # create output directory
+    make_directory(outdir)
+
+    # lets see what the object is
+    if args.url_single:
+        url = args.url_single
+        logger.info('Grabbing {0}'.format(url))
+        logger.write_to_log('Grabbing %s' % (url))
+        grab_url(url, args, outdir)
+
+    elif args.file_single:
+        pdffile = args.file_single
+        logger.info('Parsing {0}'.format(pdffile))
+        _parse_pdf(pdffile)
+
+    elif args.search:
+        search = args.search
+        logger.info('Seek and analyse {0}'.format(search))
+        seek_and_analyse(search, args, outdir)
+
+    elif args.files_dir:
+        directory = args.files_dir
+        logger.info('Analyse pdfs in directory {0}'.format(directory))
+        try:
+            files = os.listdir(directory)
+        except:
+            logger.warning('Error in args.files_dir')
+            return False
+
+        for f in files:
+            # naive filter function, later usage of filemagic possible
+            if f.find('.pdf') != -1:
+                fpath = '%s/%s' % (directory, f)
+                _parse_pdf(fpath)
+
+    else:
+        print('[-] Dunno what to do, bro. Use help. {0} -h'.format(sys.argv[0]))
+
+    # move analysis dictionary in queue back to dictionary
+    analysis_dict = {}
+    while ana_q.empty() == False:
+        item = ana_q.get()
+        # print('item ', item)
+        analysis_dict.update(item)
+
+    #print('dict:',analysis_dict)
+    # ana_q is empty now
+
+    # create txt output
+    sep = '-' * 80 + '\n'
+    txtout = "%s/%s.txt" % (outdir, out_filename)
+    fwtxt = open(txtout, 'w')
+    # print(analysis_dict)
+    for k in analysis_dict.keys():
+        fwtxt.write(sep)
+        fname = 'File: %s\n' % (analysis_dict[k]['filename'])
+        ddata = analysis_dict[k]['data']
+        fwtxt.write(fname)
+        for kdata in ddata.keys():
+            metatxt = '%s:%s\n' % (kdata, ddata[kdata])
+            fwtxt.write(metatxt)
+        fwtxt.write(sep)
+    fwtxt.close()
+
+    # create json output
+    jsonout = "%s/%s.json" % (outdir, out_filename)
+    fwjson = open(jsonout, 'w')
+
+    # print(analysis_dict)
+    jdata = json.dumps(analysis_dict)
+    fwjson.write(jdata)
+    fwjson.close()
+
+    # create html from json
+    htmlout = "%s/%s.html" % (outdir, out_filename)
+    fwhtml = open(htmlout,'w')
+    #print(jdata)
+    html = json2html.convert(json = jdata)
+    fwhtml.write(html)
+    fwhtml.close()
+
+    # create url savefile
+    # print('url_d: ', url_d)
+    jsonurlout = "%s/%s_url.json" % (outdir, out_filename)
+    fwjson = open(jsonurlout, 'w')
+    jdata = json.dumps(url_d)
+    fwjson.write(jdata)
+    fwjson.close()
+
+    txtout = "%s/%s_url.txt" % (outdir, out_filename)
+    fwtxt = open(txtout, 'w')
+    for k in url_d.keys():
+        ddata = url_d[k]
+        metatxt = '%s:%s\n' % (ddata['url'], ddata['filename'])
+        fwtxt.write(metatxt)
+    fwtxt.close()
+
+    return 42
+
+    # This is the end my friend.
 
 def main():
-    parser_desc = "%s %s %s in %s" % (name,version,author,date)
-    parser = argparse.ArgumentParser(prog = name, description=parser_desc)
-    parser.add_argument('-O','--outdir',action='store',dest='outdir',required=False,help="define the outdirectory for downloaded files and analysis output",default='pdfgrab')
-    parser.add_argument('-o','--outfile',action='store',dest='outfile',required=False,help="define file with analysis output, if no parameter given it is outdir/pdfgrab_analysis, please note outfile is *always* written to output directory so do not add the dir as extra path")
-    parser.add_argument('-u','--url',action='store',dest='url_single',required=False,help="grab pdf from specified url for analysis",default=None)
-    #parser.add_argument('-U','--url-list',action='store',dest='urls_many',required=False,help="specify txt file with list of pdf urls to grab",default=None)
-    #########
-    parser.add_argument('-f','--file',action='store',dest='file_single',required=False,help="specify local path of pdf for analysis",default=None)
-    parser.add_argument('-F','--files-dir',action='store',dest='files_dir',required=False,help="specify local path of *directory* with pdf *files* for analysis",default=None)
-    parser.add_argument('-s','--search',action='store',dest='search',required=False,help="specify domain or tld to scrape for pdf-files",default=None)
-    parser.add_argument('-sn','--search-number',action='store',dest='search_stop',required=False,help="specify how many files are searched",default=10,type=int)
-    parser.add_argument('-z','--disable-cert-check',action='store_false',dest='cert_check',required=False,help="if the target domain(s) run with old or bad certificates",default=True)
-
-    args = parser.parse_args()
-    run(args)
+    parser_desc = "%s %s %s in %s" % (name, version, author, date)
+    parser = argparse.ArgumentParser(prog=name, description=parser_desc)
+    parser.add_argument('-O', '--outdir', action='store', dest='outdir', required=False,
+                        help="define the outdirectory for downloaded files and analysis output", default='pdfgrab')
+    parser.add_argument('-o', '--outfile', action='store', dest='outfile', required=False,
+                        help="define file with analysis output, if no parameter given it is outdir/pdfgrab_analysis, please note outfile is *always* written to output directory so do not add the dir as extra path")
+    parser.add_argument('-u', '--url', action='store', dest='url_single', required=False,
+                        help="grab pdf from specified url for analysis", default=None)
+    # parser.add_argument('-U','--url-list',action='store',dest='urls_many',required=False,help="specify txt file with list of pdf urls to grab",default=None)
+    #########
+    parser.add_argument('-f', '--file', action='store', dest='file_single', required=False,
+                        help="specify local path of pdf for analysis", default=None)
+    parser.add_argument('-F', '--files-dir', action='store', dest='files_dir', required=False,
+                        help="specify local path of *directory* with pdf *files* for analysis", default=None)
+    parser.add_argument('-s', '--search', action='store', dest='search', required=False,
+                        help="specify domain or tld to scrape for pdf-files", default=None)
+    parser.add_argument('-sn', '--search-number', action='store', dest='search_stop', required=False,
+                        help="specify how many files are searched", default=10, type=int)
+    parser.add_argument('-z', '--disable-cert-check', action='store_false', dest='cert_check', required=False,
+                        help="if the target domain(s) run with old or bad certificates", default=True)
+
+    if len(sys.argv)<2:
+        parser.print_help(sys.stderr)
+        sys.exit()
+
+    args = parser.parse_args()
+    run(args)
 
 if __name__ == "__main__":
     main()
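
Note: a minimal sketch of the new HTML reporting step, not part of the commit; it mirrors the json2html call added to run() above. The sample dictionary and output filename are made up, and the json2html package must be installed.

# hypothetical stand-alone version of the html report generation
import json
from json2html import *

analysis_dict = {'pdfgrab/report.pdf': {'filename': 'report.pdf',
                                        'data': {'/Author': 'dash', '/Producer': 'LaTeX'}}}

jdata = json.dumps(analysis_dict)
html = json2html.convert(json = jdata)   # same call as in run()

with open('pdfgrab_analysis.html', 'w') as fwhtml:
    fwhtml.write(html)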