Several changes, new features, bugfixes
Readme.md | 41
@@ -1,5 +1,7 @@
# pdfgrab

* Version 0.4.4

## What is it?

This is a reborn tool, first used back in the epoch when dinosaurs were traipsing the earth.
@@ -22,13 +24,37 @@ However, common are the following things:
and some more :)

## What is this for anyways?

Well, this can be used for a range of things. However, I will only focus on the
security part of it. Depending on your target you will get information about:

* software used in company xyz
* possible version numbers
  * this will help you to identify existing vulnerabilities
* sometimes pdfs are rendered anew, for instance on upload
  * now you can figure out what the rendering engine is and find bugs in it
* who the author of a document is
  * sometimes usernames are users of the OS itself
  * congrats, by analysing a pdf you just found an existing username in the domain
  * combine this information with the first part and you know which user uses which software
* passwords ... do I need to say more?

## Is it failproof?

Not at all. Please note that metadata, like any other data, is just written to the file, so it can be changed before the file is uploaded. That said, the share of companies that really change that sort of data is maybe around 20%. You will also recognize if it is empty or the like.

## How does it work?

Every filetype more complex than .txt or the like uses metadata, be it for convenience, for customer support or simply to advertise which software has been used.
There is a lot of information online about metadata in different sorts of files such as pictures, documents, videos and music. This tool
focuses on pdf only.
If you are new to that term, have a look here:
https://en.wikipedia.org/wiki/Metadata
* https://en.wikipedia.org/wiki/Metadata

Also, if you are interested in a real pdf analysis, this tool will only do the basics for you. It has not been written to analyse bad, malicious or otherwise interesting files. Its purpose is to give you an idea of what is used at target xyz.
If you are looking for a more in-depth analysis, I recommend the tools of Didier Stevens:
* https://blog.didierstevens.com/programs/pdf-tools/

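At its core this boils down to reading the DocInfo dictionary of a pdf. A minimal sketch using the same PyPDF2 1.x calls the script relies on (the file name is just a placeholder):

```python
from PyPDF2 import pdf

def dump_metadata(path):
    fr = pdf.PdfFileReader(open(path, "rb"))
    if fr.isEncrypted:
        fr.decrypt('')            # try the empty password first, see decrypt_empty_pdf()
    info = fr.documentInfo        # may be None if no DocInfo dictionary exists
    if info is None:
        print('no metadata found in %s' % path)
        return
    for key in info.keys():
        print('%s %s' % (str(key), str(info[key])))

dump_metadata('example.pdf')
```
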
## Download
@@ -123,13 +149,18 @@ File: pdfgrab/bpf_global_data_and_static_keys.pdf
```

## TODO
* json file-output
* txt file-output
* ~~fixed some bugs with *uncommon* pdfs~~
* add socks proxy
* ~~add queues~~ for threading
* ~~add url list to output~~
* ~~json file-output~~
* ~~txt file-output~~
* catch conn refused connections
* ~~set option for certificate verification, default is true~~
* complete analyse.txt and separated
* ~~complete analyse.txt~~
* clean up code
* do more testing
* ~~do more testing~~
* do even more testing
* ~~add random useragent for google and website pdf gathering~~
* ~~add decryption routine~~
* ~~catch ssl exceptions~~

pdfgrab.py | 261
@@ -7,41 +7,127 @@
# by dash at the end of September 2019
#
# TODO
# * json file output
# * txt file output
# * complete analyse.txt and separated
# * add complete path in output as well as url where pdf came from
#   -> if the url does not exist, like in -F mode, use the local path
# * clean up code
# * do more testing
# * fine tune google search
# * add random timeout for new requests
#   -> maybe not necessary, gs has it ...
#   -> sort of necessary, on the other hand use proxychains man
# * uh oh some fancy c0l0rs
# * catch filename too long thingy
# * add thread support
# * add scrape mode, to search for pdfs at the website itself
# * add current error conditions to logfile
#
# Done
# * add url list to output
# * queues added, but no thread support yet
# * json file output
# * txt file output
# * outfilename hardcoded
# * add decryption routine
# * catch ssl exceptions
# * add random useragent for google and website pdf gathering
# * set option for certificate verification, default is true
# * catch conn refused connections
# * catch filename too long thingy

import os
import sys
import json
import queue
import urllib
import argparse
import requests
import urllib

# remove somewhen ;)
from IPython import embed

from PyPDF2 import pdf
import PyPDF2
from Crypto.Hash import SHA256
from collections import deque

# googlesearch library
import googlesearch as gs

# some variables regarding the tool itself
name = 'pdfgrab'
version = '0.4'
version = '0.4.4'
author = 'dash'
date = '2019'

# queues for processing
# this queue holds the URL locations of files to download
url_q = queue.Queue()
url_d = {}

# this queue holds the paths of files to analyse
pdf_q = queue.Queue()

# this is the analysis queue, keeping the data for further processing
ana_q = queue.Queue()

def create_sha256(hdata):
    ''' introduced to create hashes of filenames, to have a unique id
        of course hashes of the file itself will be the next topic
    '''
    hobject = SHA256.new(data=hdata.encode())
    return (hobject.hexdigest())

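# illustrative use of create_sha256() (hypothetical url), mirroring how
# process_queue_data() keys url_d below:
#   uid = create_sha256('https://example.com/report.pdf')   # 64-char hex digest
#   url_d[uid] = {'url': 'https://example.com/report.pdf', 'filename': 'report.pdf'}
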
def process_queue_data(filename,data,queue_type):
    ''' main function for processing gathered data
        i use this central function for it, so it is at *one* place
        and it is easy to change the data handling at a later step without
        deconstructing the whole code
    '''
    ana_dict = {}
    url_dict = {}

    if queue_type=='doc_info':
        print('[v] Queue DocInfo Data %s' % (filename))
        name = find_name(filename)
        path = filename

        # create a hash over the file path
        # hm, removed for now
        #path_hash = create_sha256(path)

        # order data in dict for analyse queue
        ana_dict = {path : {'filename':name,'data':data}}
        # print(data)
        # print(ana_dict)

        # add the data to queue
        add_queue(ana_q,ana_dict)

    elif queue_type=='url':
        # prepare queue entry
        print('[v] Url Queue %s' % (data))
        url_dict = {'url':data,'filename':filename}
        sha256=create_sha256(data)
        url_d[sha256]=url_dict

        # add dict to queue
        add_queue(url_q,url_dict)

    else:
        print('[-] Sorry, unknown queue. DEBUG!')
        return False

    return True

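# usage sketch for process_queue_data() (metadata values are made up):
#   process_queue_data('report.pdf', {'/Author': 'alice', '/Producer': 'LibreOffice'}, 'doc_info')
#   process_queue_data('report.pdf', 'https://example.com/report.pdf', 'url')
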
def add_queue(tqueue, data):
    ''' wrapper function for easily adding data to the
        created queues. otherwise the functions will be scattered with
        endless queue commands ;)
    '''

    tqueue.put(data)
    #d=tqueue.get()
    #print(d)
    return True

def url_strip(url):
    url = url.rstrip("\n")
    url = url.rstrip("\r")
@@ -52,32 +138,85 @@ def get_random_agent():

def get_DocInfo(filename, filehandle):
    ''' the easy way to extract metadata

        indirectObjects...
        there is an interesting situation: some pdfs seem to have the same information stored
        in different places, or things are overwritten or whatever.
        this sometimes results in an extract output with indirect objects ... this is ugly

        {'/Title': IndirectObject(111, 0), '/Producer': IndirectObject(112, 0), '/Creator': IndirectObject(113, 0), '/CreationDate': IndirectObject(114, 0), '/ModDate': IndirectObject(114, 0), '/Keywords': IndirectObject(115, 0), '/AAPL:Keywords': IndirectObject(116, 0)}

        normally getObject() is the method to use to fix this, however it was not working in this particular case.
        this thing might even bring up some more nasty things; as a (probably weak) defense and workaround
        the pdf object is not used anymore after this function, data is converted to strings...
        bad example:
    '''

    err_dict = {}
    real_extract = {}

    fh = filehandle

    try:
        extract = fh.documentInfo

    except pdf.utils.PdfReadError as e:
        print('Error: %s' % e)
        err_dict={'error':str(e)}
        return -1

    except PyPDF2.utils.PdfReadError as e:
        print('Error: %s' % e)
        err_dict={'error':str(e)}
        return -1

    finally:
        process_queue_data(filename,err_dict,'doc_info')

    print('-'*80)
    print('File: %s' % filename)
    # embed()
    # there are situations when documentInfo does not return anything
    # and extract is None
    if extract==None:
        err_dict={'error':'getDocumentInfo() returns None'}
        process_queue_data(filename,err_dict,'doc_info')
        return -1

    try:
        for k in extract.keys():
            edata = '%s %s' % (k,extract[k])
            key = str(k)
            value = str(extract[k])
            edata = '%s %s' % (key,value)
            print(edata)
            print
            real_extract[key]=value
        print('-'*80)

    except PyPDF2.utils.PdfReadError as e:
        print('Error: %s' % e)
        err_dict={'error':str(e)}
        process_queue_data(filename,err_dict,'doc_info')
        return -1

    process_queue_data(filename,real_extract,'doc_info')

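# sketch of the usual IndirectObject resolution mentioned in the docstring above
# (PyPDF2 1.x generic objects; as noted there, it did not help for the problematic
# file, which is why get_DocInfo() falls back to plain str() conversion):
#   from PyPDF2.generic import IndirectObject
#   value = extract[k]
#   if isinstance(value, IndirectObject):
#       value = value.getObject()
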
def decrypt_empty_pdf(filename):
    ''' this function simply tries to decrypt the pdf with the null password
        this does work as long as no real password has been set
        if a complex password has been set -> john
    '''

    fr = pdf.PdfFileReader(open(filename,"rb"))
    try:
        fr.decrypt('')

    except NotImplementedError as e:
        print('Error: %s' % (e))
        print('Only algorithm code 1 and 2 are supported')
        #print('Error: %s' % (e))
        print('Error: File: %s encrypted. %s' % (filename,str(e)))
        return -1
    return fr

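# usage sketch for decrypt_empty_pdf() (hypothetical path); only if the empty
# password fails would the file be handed over to john, as the docstring suggests:
#   fr = decrypt_empty_pdf('encrypted.pdf')
#   if fr != -1:
#       get_DocInfo('encrypted.pdf', fr)
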
@@ -86,7 +225,7 @@ def check_encryption(filename):
    ''' basic function to check if file is encrypted
    '''

    print(filename)
    # print(filename)
    try:
        fr = pdf.PdfFileReader(open(filename,"rb"))
    except pdf.utils.PdfReadError as e:
@@ -137,25 +276,36 @@ def download_pdf(url, args, header_data):
        req = requests.get(url,headers=header_data,verify=cert_check)
        #req = requests.get(url,headers=header_data,verify=False)
        data = req.content
        status_code = req.status_code

    except requests.exceptions.SSLError as e:
        print('Error: %s' % e)
        return -1

    except:
        print('Error: Probably something wrong with remote server')
        return -1

    if status_code == 403:
        print('%s http/403 Forbidden' % (url))
        return -1

    #print(len(data))
    return data

def store_pdf(url,data,outdir):
    ''' storing the downloaded pdf data
    '''
    print('[v] store_pdf')
    name = find_name(url)

    # only allow the stored file a name of at most 50 chars
    if len(name)>50:
        name = name[:49] + '.pdf'
        print(len(name))
        #print(len(name))

    save = "%s/%s" % (outdir,name)

    try:
        f = open(save,"wb")
    except OSError as e:
@@ -192,13 +342,18 @@ def seek_and_analyse(search,args,outdir):
        them together
    '''
    # use the search function of googlesearch to get the results
    urls = search_pdf(search,args)
    search_pdf(search,args)
    #urls = search_pdf(search,args)

    # *if* we get an answer
    if urls != -1:
    if url_q.empty()==False:
    #if urls != -1:
        # process through the list and get the pdfs
        for url in urls:
        while url_q.empty()==False:
            item=url_q.get()
            #print(item)
            url = item['url']
            grab_url(url,args,outdir)

def search_pdf(search, args):
@@ -214,16 +369,26 @@ def search_pdf(search, args):

    try:
        for url in gs.search(query,num=20,stop=search_stop,user_agent=gs.get_random_user_agent()):
            print(url)
            #print(url)
            # parse out the name of the file in the url
            filename=find_name(url)
            # add the file to queue
            process_queue_data(filename,url,'url')
            urls.append(url)

    except urllib.error.HTTPError as e:
        print('Error: %s' % e)
        return -1
    return urls
    #return urls

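# flow sketch: search_pdf() pushes {'url': ..., 'filename': ...} dicts into url_q via
# process_queue_data(), seek_and_analyse() then drains url_q and calls grab_url() for
# each entry, e.g. (hypothetical search term):
#   seek_and_analyse('interesting company', args, outdir)
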
def run(args):

    # outfile name
    if args.outfile:
        out_filename = args.outfile
    else:
        out_filename = 'pdfgrab_analysis'

    # specify output directory
    outdir = args.outdir

@@ -250,14 +415,74 @@ def run(args):
    elif args.files_dir:
        directory = args.files_dir
        print('[+] Analyse pdfs in directory %s' % (directory))
        try:
            files = os.listdir(directory)
        except:
            print('Error')
            return False

        for f in files:
            # naive filter function, later usage of filemagic possible
            if f.find('.pdf')!=-1:
                fpath = '%s/%s' % (directory,f)
                _parse_pdf(fpath)

    else:
        print('[-] Dunno what to do, bro.')

    # move analysis dictionary in queue back to dictionary
    analysis_dict = {}
    while ana_q.empty()==False:
        item = ana_q.get()
        #print('item ', item)
        analysis_dict.update(item)

    # ana_q is empty now

    # create txt output
    sep = '-'*80 + '\n'
    txtout = "%s/%s.txt" % (outdir,out_filename)
    fwtxt = open(txtout,'w')
    #print(analysis_dict)
    for k in analysis_dict.keys():
        fwtxt.write(sep)
        fname = 'File: %s\n' % (analysis_dict[k]['filename'])
        ddata = analysis_dict[k]['data']
        fwtxt.write(fname)
        for kdata in ddata.keys():
            metatxt = '%s:%s\n' % (kdata,ddata[kdata])
            fwtxt.write(metatxt)
        fwtxt.write(sep)
    fwtxt.close()

    # create json output
    jsonout = "%s/%s.json" % (outdir,out_filename)
    fwjson = open(jsonout,'w')
    #for k in analysis_dict.keys():
        #print(analysis_dict[k])
        # jdata = json.dumps(analysis_dict[k])

    #print(analysis_dict)
    jdata = json.dumps(analysis_dict)
    fwjson.write(jdata)
    fwjson.close()

    # create url savefile
    #print('url_d: ', url_d)
    jsonurlout = "%s/%s_url.json" % (outdir,out_filename)
    fwjson = open(jsonurlout,'w')
    jdata = json.dumps(url_d)
    fwjson.write(jdata)
    fwjson.close()

    txtout = "%s/%s_url.txt" % (outdir,out_filename)
    fwtxt = open(txtout,'w')
    for k in url_d.keys():
        ddata = url_d[k]
        metatxt = '%s:%s\n' % (ddata['url'], ddata['filename'])
        fwtxt.write(metatxt)
    fwtxt.close()

    return 42
    # This is the end my friend.
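# resulting layout of the analysis JSON written above (paths and values are made-up examples):
#   { "pdfgrab/report.pdf": { "filename": "report.pdf",
#                             "data": { "/Author": "alice", "/Producer": "LibreOffice 6.2" } } }
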
@@ -266,7 +491,7 @@ def main():
    parser_desc = "%s %s %s in %s" % (name,version,author,date)
    parser = argparse.ArgumentParser(prog = name, description=parser_desc)
    parser.add_argument('-O','--outdir',action='store',dest='outdir',required=False,help="define the outdirectory for downloaded files and analysis output",default='pdfgrab')
    # parser.add_argument('-o','--outfile',action='store',dest='outfile',required=False,help="define file with analysis output in txt format",default='pdfgrab_analysis.txt')
    parser.add_argument('-o','--outfile',action='store',dest='outfile',required=False,help="define file with analysis output, if no parameter given it is outdir/pdfgrab_analysis, please note outfile is *always* written to output directory so do not add the dir as extra path")
    parser.add_argument('-u','--url',action='store',dest='url_single',required=False,help="grab pdf from specified url for analysis",default=None)
    #parser.add_argument('-U','--url-list',action='store',dest='urls_many',required=False,help="specify txt file with list of pdf urls to grab",default=None)
    #########