several changes, new features, bugfixes
pdfgrab.py
@@ -7,41 +7,127 @@
# by dash at the end of September 2019
#
# TODO
# * json file output
# * txt file output
# * complete analyse.txt and separated
# * add complete path in output as well as url where pdf came from
#   -> if the url does not exist, like in -F mode, then the local path
# * clean up code
# * do more testing
# * fine tune google search
# * add random timeout for new requests
#   -> maybe not necessary, gs has it ...
#   -> sort of necessary, on the other hand use proxychains man
# * uh oh some fancy c0l0rs
# * catch filename too long thingy
# * add thread support
# * add scrape mode, to search for pdfs at the website itself
# * add current error conditions to logfile
#
# Done
# * add url list to output
# * queues added, but no thread support yet
# * json file output
# * txt file output
# * outfilename hardcoded
# * add decryption routine
# * catch ssl exceptions
# * add random useragent for google and website pdf gathering
# * set option for certificate verification, default is true
# * catch conn refused connections
# * catch filename too long thingy

import os
import sys
import json
import queue
import urllib
import argparse
import requests
import urllib

# remove somewhen ;)
from IPython import embed

from PyPDF2 import pdf
import PyPDF2
from Crypto.Hash import SHA256
from collections import deque

# googlesearch library
import googlesearch as gs

# some variables in regard of the tool itself
name = 'pdfgrab'
version = '0.4'
version = '0.4.4'
author = 'dash'
date = '2019'

# queues for processing
# this queue holds the URL locations of files to download
url_q = queue.Queue()
url_d = {}

# this queue holds the paths of files to analyse
pdf_q = queue.Queue()

# this is the analysis queue, keeping the data for further processing
ana_q = queue.Queue()

def create_sha256(hdata):
    ''' introduced to create hashes of filenames, to have a unique id
        of course hashes of the file itself will be the next topic
    '''
    hobject = SHA256.new(data=hdata.encode())
    return (hobject.hexdigest())
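
# Illustrative sketch (values hypothetical): create_sha256 returns the 64-char
# hex digest of its input string, e.g.
#   create_sha256('example.pdf')  ->  'b0a4...'  (64 hex chars in total)
# which is what url_d further below uses as a unique key per URL.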

def process_queue_data(filename,data,queue_type):
    ''' main function for processing gathered data
        I use this central function so the handling is at *one* place
        and it is easy to change the data handling at a later step without
        deconstructing the whole code
    '''
    ana_dict = {}
    url_dict = {}

    if queue_type=='doc_info':
        print('[v] Queue DocInfo Data %s' % (filename))
        name = find_name(filename)
        path = filename

        # create a hash over the file path
        # hm, removed for now
        #path_hash = create_sha256(path)

        # order data in dict for analyse queue
        ana_dict = {path : {'filename':name,'data':data}}
        # print(data)
        # print(ana_dict)

        # add the data to queue
        add_queue(ana_q,ana_dict)

    elif queue_type=='url':
        # prepare queue entry
        print('[v] Url Queue %s' % (data))
        url_dict = {'url':data,'filename':filename}
        sha256=create_sha256(data)
        url_d[sha256]=url_dict

        # add dict to queue
        add_queue(url_q,url_dict)

    else:
        print('[-] Sorry, unknown queue. DEBUG!')
        return False

    return True
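
# Sketch of the two queue entry shapes produced above (field values hypothetical):
#   ana_q items: { '/tmp/pdfgrab/example.pdf': {'filename': 'example.pdf',
#                                               'data': {'/Author': '...', '/Producer': '...'}} }
#   url_q items: { 'url': 'https://example.com/example.pdf', 'filename': 'example.pdf' }
# url_d additionally keys each url entry by create_sha256(url).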

def add_queue(tqueue, data):
    ''' wrapper function for easily adding data to the
        created queues. otherwise the functions will be scattered with
        endless queue commands ;)
    '''

    tqueue.put(data)
    #d=tqueue.get()
    #print(d)
    return True

def url_strip(url):
    url = url.rstrip("\n")
    url = url.rstrip("\r")
@@ -52,32 +138,85 @@ def get_random_agent():

def get_DocInfo(filename, filehandle):
    ''' the easy way to extract metadata

        indirectObjects...
        there is an interesting situation: some pdfs seem to have the same information stored
        in different places, or things are overwritten or whatever.
        this sometimes results in an extract output with indirect objects ... this is ugly:

        {'/Title': IndirectObject(111, 0), '/Producer': IndirectObject(112, 0), '/Creator': IndirectObject(113, 0), '/CreationDate': IndirectObject(114, 0), '/ModDate': IndirectObject(114, 0), '/Keywords': IndirectObject(115, 0), '/AAPL:Keywords': IndirectObject(116, 0)}

        normally getObject() is the method to use to resolve this, however it was not working in this particular case.
        this thing might even bring up some more nasty things, so as a (probably weak) defense and workaround
        the pdf object is not used anymore after this function; the data is converted to strings...
        the dict above is the bad example.
    '''
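
    # Illustrative note (assumption, not taken from this code): resolving an
    # IndirectObject would normally look roughly like
    #     title = extract['/Title'].getObject()
    # since PyPDF2's IndirectObject.getObject() follows the reference; as the
    # docstring says that did not work here, so values are str()-converted below.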

    err_dict = {}
    real_extract = {}

    fh = filehandle

    try:
        extract = fh.documentInfo

    except pdf.utils.PdfReadError as e:
        print('Error: %s' % e)
        err_dict={'error':str(e)}
        return -1

    except PyPDF2.utils.PdfReadError as e:
        print('Error: %s' % e)
        err_dict={'error':str(e)}
        return -1

    finally:
        process_queue_data(filename,err_dict,'doc_info')

    print('-'*80)
    print('File: %s' % filename)
    for k in extract.keys():
        edata = '%s %s' % (k,extract[k])
        print(edata)
        print
    print('-'*80)
    # embed()
    # there are situations when documentinfo does not return anything
    # and extract is None
    if extract==None:
        err_dict={'error':'getDocumentInfo() returns None'}
        process_queue_data(filename,err_dict,'doc_info')
        return -1

    try:
        for k in extract.keys():
            key = str(k)
            value = str(extract[k])
            edata = '%s %s' % (key,value)
            print(edata)
            print
            real_extract[key]=value
        print('-'*80)

    except PyPDF2.utils.PdfReadError as e:
        print('Error: %s' % e)
        err_dict={'error':str(e)}
        process_queue_data(filename,err_dict,'doc_info')
        return -1

    process_queue_data(filename,real_extract,'doc_info')


def decrypt_empty_pdf(filename):
    ''' this function simply tries to decrypt the pdf with the null password
        this does work, as long as no real password has been set
        if a complex password has been set -> john
    '''

    fr = pdf.PdfFileReader(open(filename,"rb"))
    try:
        fr.decrypt('')

    except NotImplementedError as e:
        print('Error: %s' % (e))
        print('Only algorithm code 1 and 2 are supported')
        #print('Error: %s' % (e))
        print('Error: File: %s encrypted. %s' % (filename,str(e)))
        return -1
    return fr

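# Side note (assumption about the library, not shown in this diff): in the classic
# PyPDF2 1.x API, PdfFileReader.decrypt(password) is documented to return 0 when
# decryption failed, 1 when the user password matched and 2 for the owner password,
# so a caller could also inspect that return value. For a real password the hint
# "-> john" refers to cracking it externally, e.g. with John the Ripper.
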
@@ -86,7 +225,7 @@ def check_encryption(filename):
    ''' basic function to check if file is encrypted
    '''

    print(filename)
    # print(filename)
    try:
        fr = pdf.PdfFileReader(open(filename,"rb"))
    except pdf.utils.PdfReadError as e:
@@ -137,25 +276,36 @@ def download_pdf(url, args, header_data):
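        # Illustrative note (assumption): header_data is presumably a requests-style
        # headers dict carrying the random User-Agent mentioned in the Done list, e.g.
        #   header_data = {'User-Agent': 'Mozilla/5.0 (...)'}
        # and cert_check is the certificate-verification switch (default true).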
        req = requests.get(url,headers=header_data,verify=cert_check)
        #req = requests.get(url,headers=header_data,verify=False)
        data = req.content
        status_code = req.status_code

    except requests.exceptions.SSLError as e:
        print('Error: %s' % e)
        return -1

    except:
        print('Error: Probably something wrong with remote server')
        return -1

    if status_code == 403:
        print('%s http/403 Forbidden' % (url))
        return -1

    #print(len(data))
    return data

def store_pdf(url,data,outdir):
    ''' storing the downloaded pdf data
    '''
    print('[v] store_pdf')
    name = find_name(url)

    # only allow the stored file a name of 50 chars
    name = name[:49] + '.pdf'
    print(len(name))
    if len(name)>50:
        name = name[:49] + '.pdf'
        #print(len(name))

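    # Reading aid (assumption): this cap appears to be the fix for the "catch
    # filename too long thingy" TODO above; name[:49] keeps at most 49 chars before
    # '.pdf' is appended, well under the ~255-byte file name limit of most filesystems.
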
    save = "%s/%s" % (outdir,name)

    try:
        f = open(save,"wb")
    except OSError as e:
@@ -192,13 +342,18 @@ def seek_and_analyse(search,args,outdir):
        them together
    '''
    # use the search function of googlesearch to get the results
    urls = search_pdf(search,args)
    search_pdf(search,args)
    #urls = search_pdf(search,args)

    # *if* we get an answer
    if urls != -1:
    if url_q.empty()==False:
    #if urls != -1:
        # process through the list and get the pdfs
        for url in urls:
        while url_q.empty()==False:
            item=url_q.get()
            #print(item)
            url = item['url']
            grab_url(url,args,outdir)
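
# Reading aid for the change above: in the newer queue-based flow, search_pdf() no
# longer needs to return a list; it pushes each hit into url_q via
# process_queue_data(), and seek_and_analyse() simply drains url_q and calls
# grab_url() per entry.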

def search_pdf(search, args):
@@ -214,16 +369,26 @@ def search_pdf(search, args):

    try:
        for url in gs.search(query,num=20,stop=search_stop,user_agent=gs.get_random_user_agent()):
            print(url)
            #print(url)
            # parse out the name of the file in the url
            filename=find_name(url)
            # add the file to queue
            process_queue_data(filename,url,'url')
            urls.append(url)

    except urllib.error.HTTPError as e:
        print('Error: %s' % e)
        return -1
    return urls
    #return urls
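
# Illustration (hypothetical values): for a hit such as
#   https://example.com/papers/whitepaper.pdf
# find_name() is expected to yield 'whitepaper.pdf', which is queued together with
# the full URL via process_queue_data(..., 'url') and later written to the
# *_url.txt / *_url.json report files.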

def run(args):

    # outfile name
    if args.outfile:
        out_filename = args.outfile
    else:
        out_filename = 'pdfgrab_analysis'

    # specify output directory
    outdir = args.outdir

@@ -250,14 +415,74 @@ def run(args):
    elif args.files_dir:
        directory = args.files_dir
        print('[+] Analyse pdfs in directory %s' % (directory))
        files = os.listdir(directory)
        try:
            files = os.listdir(directory)
        except:
            print('Error')
            return False

        for f in files:
            fpath = '%s/%s' % (directory,f)
            _parse_pdf(fpath)
            # naive filter function, later usage of filemagic possible
            if f.find('.pdf')!=-1:
                fpath = '%s/%s' % (directory,f)
                _parse_pdf(fpath)

    else:
        print('[-] Dunno what to do, bro.')

    # move analysis dictionary in queue back to dictionary
    analysis_dict = {}
    while ana_q.empty()==False:
        item = ana_q.get()
        #print('item ', item)
        analysis_dict.update(item)

    # ana_q is empty now

    # create txt output
    sep = '-'*80 + '\n'
    txtout = "%s/%s.txt" % (outdir,out_filename)
    fwtxt = open(txtout,'w')
    #print(analysis_dict)
    for k in analysis_dict.keys():
        fwtxt.write(sep)
        fname = 'File: %s\n' % (analysis_dict[k]['filename'])
        ddata = analysis_dict[k]['data']
        fwtxt.write(fname)
        for kdata in ddata.keys():
            metatxt = '%s:%s\n' % (kdata,ddata[kdata])
            fwtxt.write(metatxt)
        fwtxt.write(sep)
    fwtxt.close()
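
    # Sketch of one record in the resulting .txt report (values hypothetical):
    #   --------------------------------------------------------------------------------
    #   File: example.pdf
    #   /Author:John Doe
    #   /Producer:Some PDF Library 1.2
    #   --------------------------------------------------------------------------------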

    # create json output
    jsonout = "%s/%s.json" % (outdir,out_filename)
    fwjson = open(jsonout,'w')
    #for k in analysis_dict.keys():
    #print(analysis_dict[k])
    # jdata = json.dumps(analysis_dict[k])

    #print(analysis_dict)
    jdata = json.dumps(analysis_dict)
    fwjson.write(jdata)
    fwjson.close()
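
    # Sketch of the .json layout (values hypothetical): analysis_dict is keyed by the
    # local file path as built in process_queue_data(), i.e.
    #   { "pdfgrab/example.pdf": { "filename": "example.pdf",
    #                              "data": { "/Author": "John Doe", "/Producer": "..." } } }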

    # create url savefile
    #print('url_d: ', url_d)
    jsonurlout = "%s/%s_url.json" % (outdir,out_filename)
    fwjson = open(jsonurlout,'w')
    jdata = json.dumps(url_d)
    fwjson.write(jdata)
    fwjson.close()

    txtout = "%s/%s_url.txt" % (outdir,out_filename)
    fwtxt = open(txtout,'w')
    for k in url_d.keys():
        ddata = url_d[k]
        metatxt = '%s:%s\n' % (ddata['url'], ddata['filename'])
        fwtxt.write(metatxt)
    fwtxt.close()
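
    # Sketch of the *_url.json layout (values hypothetical): url_d is keyed by the
    # sha256 of each URL, so one entry looks like
    #   { "<64-hex-char sha256>": { "url": "https://example.com/example.pdf",
    #                               "filename": "example.pdf" } }
    # while *_url.txt simply lists "url:filename" pairs, one per line.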

    return 42
    # This is the end my friend.
@@ -266,7 +491,7 @@ def main():
    parser_desc = "%s %s %s in %s" % (name,version,author,date)
    parser = argparse.ArgumentParser(prog = name, description=parser_desc)
    parser.add_argument('-O','--outdir',action='store',dest='outdir',required=False,help="define the outdirectory for downloaded files and analysis output",default='pdfgrab')
    # parser.add_argument('-o','--outfile',action='store',dest='outfile',required=False,help="define file with analysis output in txt format",default='pdfgrab_analysis.txt')
    parser.add_argument('-o','--outfile',action='store',dest='outfile',required=False,help="define file with analysis output; if no parameter is given it is outdir/pdfgrab_analysis. please note the outfile is *always* written to the output directory, so do not add the dir as an extra path")
    parser.add_argument('-u','--url',action='store',dest='url_single',required=False,help="grab pdf from specified url for analysis",default=None)
    #parser.add_argument('-U','--url-list',action='store',dest='urls_many',required=False,help="specify txt file with list of pdf urls to grab",default=None)
    #########