cert-check, random-user-agent, catching too many requests
@@ -123,14 +123,14 @@ File: pdfgrab/bpf_global_data_and_static_keys.pdf
 ```
 
 ## TODO
-* json output
-* txt output
+* json file-output
+* txt file-output
 * catch conn refused connections
-* set option for certificate verification, default is false
+* ~~set option for certificate verification, default is true~~
 * complete analyse.txt and seperated
 * clean up code
 * do more testing
-* add random useragent for google and website pdf gathering
+* ~~add random useragent for google and website pdf gathering~~
 * ~~add decryption routine~~
 * ~~catch ssl exceptions~~
 
pdfgrab.py (125 changed lines)
@@ -4,45 +4,55 @@
 # and not some self crafted f00
 #
 # new features, new layout, new new :>
-# dash in end of September 2019
-#
+# by dash at the end of September 2019
 #
 # TODO
-# * json output
-# * txt output
-# * catch conn refused connections
-# * set option for certificate verification, default is false
+# * json file output
+# * txt file output
 # * complete analyse.txt and seperated
 # * clean up code
 # * do more testing
-# * add random useragent for google and website pdf gathering
+# * fine tune google search
+# * add random timeout for new requests
+# -> maybe not necessary, gs has it ...
+# -> sort of necessary, on the other hand use proxychains man
+# * uh oh some fancy c0l0rs
+# * catch filename to long thingy
 #
 # Done
 # * add decryption routine
 # * catch ssl exceptions
+# * add random useragent for google and website pdf gathering
+# * set option for certificate verification, default is true
+# * catch conn refused connections
 
 import os
 import sys
 import argparse
 import requests
+import urllib
 
 from IPython import embed
 
 from PyPDF2 import pdf
 import googlesearch as gs
 
-_name_ = 'pdfgrab'
-_version_ = '0.3'
-_author_ = 'dash'
-_date_ = '2019'
+name = 'pdfgrab'
+version = '0.4'
+author = 'dash'
+date = '2019'
 
 def url_strip(url):
     url = url.rstrip("\n")
     url = url.rstrip("\r")
     return url
 
+def get_random_agent():
+    return (gs.get_random_user_agent())
 
 def get_DocInfo(filename, filehandle):
+    ''' the easy way to extract metadata
+    '''
 
     fh = filehandle
     try:
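The hunk above introduces `get_random_agent()`, a thin wrapper around `googlesearch.get_random_user_agent()`. Below is a minimal sketch of how such a randomized `User-Agent` header is typically fed into a `requests` download; the fallback string and the example URL are illustrative, not part of pdfgrab.

```
import requests

try:
    import googlesearch as gs
    user_agent = gs.get_random_user_agent()   # same helper pdfgrab wraps
except ImportError:
    # placeholder fallback, only for this sketch
    user_agent = 'Mozilla/5.0 (compatible; pdfgrab-sketch)'

header_data = {'User-Agent': user_agent}
# example URL is illustrative; verify=True keeps certificate checking on
resp = requests.get('https://example.org/some.pdf',
                    headers=header_data, verify=True, timeout=30)
print(resp.status_code, len(resp.content))
```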
@@ -117,10 +127,14 @@ def make_directory(outdir):
         #print("[W] mkdir, some error, directory probably exists")
         pass
 
-def download_pdf(url, header_data):
+def download_pdf(url, args, header_data):
     ''' downloading the pdfile for later analysis '''
+
+    # check the remote tls certificate or not?
+    cert_check = args.cert_check
+
     try:
-        req = requests.get(url,headers=header_data,verify=True)
+        req = requests.get(url,headers=header_data,verify=cert_check)
         #req = requests.get(url,headers=header_data,verify=False)
         data = req.content
     except requests.exceptions.SSLError as e:
@@ -134,8 +148,13 @@ def download_pdf(url, header_data):
     return data
 
 def store_pdf(url,data,outdir):
-    ''' storing the downloaded pdf data '''
+    ''' storing the downloaded pdf data
+    '''
     name = find_name(url)
+
+    # only allow stored file a name with 50 chars
+    name = name[:49] + '.pdf'
+    print(len(name))
     save = "%s/%s" % (outdir,name)
     try:
         f = open(save,"wb")
@@ -153,56 +172,54 @@ def store_pdf(url,data,outdir):
 def _parse_pdf(filename):
     ''' the real parsing function '''
 
-    check_encryption(filename)
-    return True
+    ret = check_encryption(filename)
+    return ret
 
-    print('[+] Opening %s' % filename)
-    pdfile = open(filename,'rb')
-
-    try:
-        h = pdf.PdfFileReader(pdfile)
-    except pdf.utils.PdfReadError as e:
-        print('[-] Error: %s' % (e))
-        return
-
-    return pdfile
-
-
-def parse_single_pdf(filename):
-    ''' single parse function '''
-    return 123
-
-def grab_url(url, outdir):
+def grab_url(url, args, outdir):
     ''' function keeping all the steps for the user call of grabbing
     just one pdf and analysing it
     '''
-    data = download_pdf(url,None)
+    header_data={'User-Agent':get_random_agent()}
+    data = download_pdf(url,args, header_data)
     if data != -1:
         savepath = store_pdf(url, data, outdir)
         _parse_pdf(savepath)
 
     return
 
-def seek_and_analyse(search,sargs,outdir):
+def seek_and_analyse(search,args,outdir):
     ''' function for keeping all the steps of searching for pdfs and analysing
     them together
     '''
-    urls = search_pdf(search,sargs)
-    for url in urls:
-        grab_url(url,outdir)
+    # use the search function of googlesearch to get the results
+    urls = search_pdf(search,args)
+
+    # *if* we get an answer
+    if urls != -1:
+        # process through the list and get the pdfs
+        for url in urls:
+            grab_url(url,args,outdir)
 
-def search_pdf(search, sargs):
+def search_pdf(search, args):
     ''' the function where googlesearch from mario vilas
     is called
     '''
 
+    search_stop = args.search_stop
+
     query='%s filetype:pdf' % search
     #print(query)
     urls = []
-    for url in gs.search(query,num=20,stop=sargs):
-        print(url)
-        urls.append(url)
+    try:
+        for url in gs.search(query,num=20,stop=search_stop,user_agent=gs.get_random_user_agent()):
+            print(url)
+            urls.append(url)
+
+    except urllib.error.HTTPError as e:
+        print('Error: %s' % e)
+        return -1
 
     return urls
 
 def run(args):
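The `search_pdf()` change above is the "catching too many requests" part of the commit message: `gs.search()` raises `urllib.error.HTTPError` (typically 429 Too Many Requests) when Google throttles the scraper, and the function now returns `-1` so `seek_and_analyse()` can skip the download loop. A reduced, self-contained sketch of that control flow; the function name and the query term are illustrative.

```
import urllib.error
import googlesearch as gs

def search_pdf_sketch(search, stop=10):
    query = '%s filetype:pdf' % search      # same query shape as pdfgrab
    urls = []
    try:
        for url in gs.search(query, num=20, stop=stop,
                             user_agent=gs.get_random_user_agent()):
            urls.append(url)
    except urllib.error.HTTPError as e:
        # Google rate limiting usually surfaces here as "HTTP Error 429"
        print('Error: %s' % e)
        return -1
    return urls

urls = search_pdf_sketch('example.com')     # performs a live Google query
if urls != -1:
    for url in urls:
        print(url)
```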
@@ -217,7 +234,7 @@ def run(args):
     if args.url_single:
         url = args.url_single
         print('[+] Grabbing %s' % (url))
-        grab_url(url, outdir)
+        grab_url(url, args,outdir)
 
     elif args.file_single:
         pdffile = args.file_single
@@ -226,10 +243,9 @@ def run(args):
 
     elif args.search:
         search = args.search
-        sargs = args.search_stop
         #print(args)
         print('[+] Seek and de...erm...analysing %s' % (search))
-        seek_and_analyse(search,sargs,outdir)
+        seek_and_analyse(search,args,outdir)
 
     elif args.files_dir:
         directory = args.files_dir
@@ -239,23 +255,26 @@ def run(args):
             fpath = '%s/%s' % (directory,f)
             _parse_pdf(fpath)
 
 
 
 
     else:
         print('[-] Dunno what to do, bro.')
-        #logfile = "%s/%s.txt" % (out,out)
-        #flog = open(logfile,"w")
+    return 42
+    # This is the end my friend.
 
 def main():
-    parser_desc = "%s %s %s" % (_name_,_version_,_author_)
-    parser = argparse.ArgumentParser(prog = __name__, description=parser_desc)
-    parser.add_argument('-o','--outdir',action='store',dest='outdir',required=False,help="define the outdirectory for downloaded files and analysis output",default='pdfgrab')
+    parser_desc = "%s %s %s in %s" % (name,version,author,date)
+    parser = argparse.ArgumentParser(prog = name, description=parser_desc)
+    parser.add_argument('-O','--outdir',action='store',dest='outdir',required=False,help="define the outdirectory for downloaded files and analysis output",default='pdfgrab')
+    # parser.add_argument('-o','--outfile',action='store',dest='outfile',required=False,help="define file with analysis output in txt format",default='pdfgrab_analysis.txt')
     parser.add_argument('-u','--url',action='store',dest='url_single',required=False,help="grab pdf from specified url for analysis",default=None)
+    #parser.add_argument('-U','--url-list',action='store',dest='urls_many',required=False,help="specify txt file with list of pdf urls to grab",default=None)
+    #########
     parser.add_argument('-f','--file',action='store',dest='file_single',required=False,help="specify local path of pdf for analysis",default=None)
     parser.add_argument('-F','--files-dir',action='store',dest='files_dir',required=False,help="specify local path of *directory* with pdf *files* for analysis",default=None)
     parser.add_argument('-s','--search',action='store',dest='search',required=False,help="specify domain or tld to scrape for pdf-files",default=None)
    parser.add_argument('-sn','--search-number',action='store',dest='search_stop',required=False,help="specify how many files are searched",default=10,type=int)
+    parser.add_argument('-z','--disable-cert-check',action='store_false',dest='cert_check',required=False,help="if the target domain(s) run with old or bad certificates",default=True)
 
     args = parser.parse_args()
     run(args)
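The new `-z/--disable-cert-check` option relies on `action='store_false'`: `args.cert_check` defaults to `True` and only flips to `False` when the flag is passed, which is the value `download_pdf()` hands to `requests.get(..., verify=...)`. A small standalone sketch of that argparse behaviour (the prog name is illustrative).

```
import argparse

parser = argparse.ArgumentParser(prog='pdfgrab-sketch')
parser.add_argument('-z', '--disable-cert-check', action='store_false',
                    dest='cert_check', default=True,
                    help="disable TLS certificate verification")

print(parser.parse_args([]).cert_check)      # True  -> requests verifies the cert
print(parser.parse_args(['-z']).cert_check)  # False -> requests.get(verify=False)
```

With `verify=False`, `requests` still downloads the file but emits an `InsecureRequestWarning`, so the flag is only meant for targets with old or broken certificates, as the help text says.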