diff --git a/Readme.md b/Readme.md
index 079b10d..80fd4ce 100644
--- a/Readme.md
+++ b/Readme.md
@@ -123,14 +123,14 @@ File: pdfgrab/bpf_global_data_and_static_keys.pdf
 ```
 
 ## TODO
-* json output
-* txt output
+* json file-output
+* txt file-output
 * catch conn refused connections
-* set option for certificate verification, default is false
+* ~~set option for certificate verification, default is true~~
 * complete analyse.txt and seperated
 * clean up code
 * do more testing
-* add random useragent for google and website pdf gathering
+* ~~add random useragent for google and website pdf gathering~~
 * ~~add decryption routine~~
 * ~~catch ssl exceptions~~
 
diff --git a/pdfgrab.py b/pdfgrab.py
index e696639..79e99b9 100755
--- a/pdfgrab.py
+++ b/pdfgrab.py
@@ -4,45 +4,55 @@
 # and not some self crafted f00
 #
 # new features, new layout, new new :>
-# dash in end of September 2019
+# by dash at the end of September 2019
 #
-#
 # TODO
-# * json output
-# * txt output
-# * catch conn refused connections
-# * set option for certificate verification, default is false
+# * json file output
+# * txt file output
 # * complete analyse.txt and seperated
 # * clean up code
 # * do more testing
-# * add random useragent for google and website pdf gathering
+# * fine tune google search
+# * add random timeout for new requests
+#   -> maybe not necessary, gs has it ...
+#   -> sort of necessary, on the other hand use proxychains man
+# * uh oh some fancy c0l0rs
+# * catch filename too long thingy
 #
 # Done
 # * add decryption routine
 # * catch ssl exceptions
+# * add random useragent for google and website pdf gathering
+# * set option for certificate verification, default is true
+# * catch conn refused connections
 
 import os
 import sys
 import argparse
 import requests
+import urllib
 
 from IPython import embed
 from PyPDF2 import pdf
 import googlesearch as gs
 
-_name_ = 'pdfgrab'
-_version_ = '0.3'
-_author_ = 'dash'
-_date_ = '2019'
+name = 'pdfgrab'
+version = '0.4'
+author = 'dash'
+date = '2019'
 
 def url_strip(url):
     url = url.rstrip("\n")
     url = url.rstrip("\r")
     return url
 
+def get_random_agent():
+    return (gs.get_random_user_agent())
 
 def get_DocInfo(filename, filehandle):
+    ''' the easy way to extract metadata
+    '''
     fh = filehandle
 
     try:
@@ -117,10 +127,14 @@ def make_directory(outdir):
         #print("[W] mkdir, some error, directory probably exists")
         pass
 
-def download_pdf(url, header_data):
+def download_pdf(url, args, header_data):
     ''' downloading the pdfile for later analysis '''
+
+    # check the remote tls certificate or not?
+    cert_check = args.cert_check
+
     try:
-        req = requests.get(url,headers=header_data,verify=True)
+        req = requests.get(url,headers=header_data,verify=cert_check)
         #req = requests.get(url,headers=header_data,verify=False)
         data = req.content
     except requests.exceptions.SSLError as e:
@@ -134,8 +148,13 @@ def download_pdf(url, header_data):
     return data
 
 def store_pdf(url,data,outdir):
-    ''' storing the downloaded pdf data '''
+    ''' storing the downloaded pdf data
+    '''
     name = find_name(url)
+
+    # only allow stored file a name with 50 chars
+    name = name[:49] + '.pdf'
+    print(len(name))
     save = "%s/%s" % (outdir,name)
     try:
         f = open(save,"wb")
@@ -153,56 +172,54 @@ def store_pdf(url,data,outdir):
 
 def _parse_pdf(filename):
     ''' the real parsing function '''
-    check_encryption(filename)
-    return True
+    ret = check_encryption(filename)
+    return ret
 
-    print('[+] Opening %s' % filename)
-    pdfile = open(filename,'rb')
-
-    try:
-        h = pdf.PdfFileReader(pdfile)
-    except pdf.utils.PdfReadError as e:
-        print('[-] Error: %s' % (e))
-        return
-
-    return pdfile
-
-
-def parse_single_pdf(filename):
-    ''' single parse function '''
-    return 123
-
-def grab_url(url, outdir):
+def grab_url(url, args, outdir):
     ''' function keeping all the steps for the user call
         of grabbing just one pdf and analysing it
     '''
-    data = download_pdf(url,None)
+    header_data={'User-Agent':get_random_agent()}
+    data = download_pdf(url,args, header_data)
 
     if data != -1:
         savepath = store_pdf(url, data, outdir)
         _parse_pdf(savepath)
 
     return
 
-def seek_and_analyse(search,sargs,outdir):
+def seek_and_analyse(search,args,outdir):
     ''' function for keeping all the steps of searching for
        pdfs and analysing them together
     '''
-    urls = search_pdf(search,sargs)
-    for url in urls:
-        grab_url(url,outdir)
+    # use the search function of googlesearch to get the results
+    urls = search_pdf(search,args)
 
-def search_pdf(search, sargs):
+
+    # *if* we get an answer
+    if urls != -1:
+        # process through the list and get the pdfs
+        for url in urls:
+            grab_url(url,args,outdir)
+
+def search_pdf(search, args):
     ''' the function where googlesearch from mario vilas
        is called
     '''
+    search_stop = args.search_stop
+
     query='%s filetype:pdf' % search
     #print(query)
     urls = []
-    for url in gs.search(query,num=20,stop=sargs):
-        print(url)
-        urls.append(url)
+
+    try:
+        for url in gs.search(query,num=20,stop=search_stop,user_agent=gs.get_random_user_agent()):
+            print(url)
+            urls.append(url)
+    except urllib.error.HTTPError as e:
+        print('Error: %s' % e)
+        return -1
 
     return urls
 
 def run(args):
@@ -217,7 +234,7 @@ def run(args):
     if args.url_single:
         url = args.url_single
         print('[+] Grabbing %s' % (url))
-        grab_url(url, outdir)
+        grab_url(url, args,outdir)
 
     elif args.file_single:
         pdffile = args.file_single
@@ -226,10 +243,9 @@ def run(args):
 
     elif args.search:
         search = args.search
-        sargs = args.search_stop
         #print(args)
         print('[+] Seek and de...erm...analysing %s' % (search))
-        seek_and_analyse(search,sargs,outdir)
+        seek_and_analyse(search,args,outdir)
 
     elif args.files_dir:
         directory = args.files_dir
@@ -239,23 +255,26 @@ def run(args):
             fpath = '%s/%s' % (directory,f)
             _parse_pdf(fpath)
-
-
-
     else:
         print('[-] Dunno what to do, bro.')
 
-    #logfile = "%s/%s.txt" % (out,out)
-    #flog = open(logfile,"w")
+
+
+    return 42
+    # This is the end my friend.
 
 def main():
-    parser_desc = "%s %s %s" % (_name_,_version_,_author_)
-    parser = argparse.ArgumentParser(prog = __name__, description=parser_desc)
-    parser.add_argument('-o','--outdir',action='store',dest='outdir',required=False,help="define the outdirectory for downloaded files and analysis output",default='pdfgrab')
+    parser_desc = "%s %s %s in %s" % (name,version,author,date)
+    parser = argparse.ArgumentParser(prog = name, description=parser_desc)
+    parser.add_argument('-O','--outdir',action='store',dest='outdir',required=False,help="define the outdirectory for downloaded files and analysis output",default='pdfgrab')
+# parser.add_argument('-o','--outfile',action='store',dest='outfile',required=False,help="define file with analysis output in txt format",default='pdfgrab_analysis.txt')
     parser.add_argument('-u','--url',action='store',dest='url_single',required=False,help="grab pdf from specified url for analysis",default=None)
+    #parser.add_argument('-U','--url-list',action='store',dest='urls_many',required=False,help="specify txt file with list of pdf urls to grab",default=None)
+#########
     parser.add_argument('-f','--file',action='store',dest='file_single',required=False,help="specify local path of pdf for analysis",default=None)
     parser.add_argument('-F','--files-dir',action='store',dest='files_dir',required=False,help="specify local path of *directory* with pdf *files* for analysis",default=None)
     parser.add_argument('-s','--search',action='store',dest='search',required=False,help="specify domain or tld to scrape for pdf-files",default=None)
     parser.add_argument('-sn','--search-number',action='store',dest='search_stop',required=False,help="specify how many files are searched",default=10,type=int)
+    parser.add_argument('-z','--disable-cert-check',action='store_false',dest='cert_check',required=False,help="if the target domain(s) run with old or bad certificates",default=True)
 
     args = parser.parse_args()
     run(args)
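
The certificate handling is the most visible behavioural change: `download_pdf()` now takes `args` and forwards `args.cert_check` straight into `requests.get(..., verify=...)`, with verification on by default. A minimal, self-contained sketch of how the new `-z/--disable-cert-check` flag produces that value; the standalone parser below is for illustration only and is not part of the patch:

```python
import argparse


def build_parser():
    # action='store_false' with default=True means verification stays ON
    # unless the user explicitly passes -z, matching the Done entry
    # "set option for certificate verification, default is true"
    parser = argparse.ArgumentParser(prog='pdfgrab')
    parser.add_argument('-z', '--disable-cert-check', action='store_false',
                        dest='cert_check', default=True,
                        help="if the target domain(s) run with old or bad certificates")
    return parser


if __name__ == '__main__':
    args = build_parser().parse_args()
    print('cert_check =', args.cert_check)   # True without -z, False with -z
    # download_pdf() then forwards the flag unchanged:
    # requests.get(url, headers=header_data, verify=args.cert_check)
```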
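
`store_pdf()` now caps the name derived from the URL at 49 characters and appends `.pdf`, which is what the `print(len(name))` debug line reports. A tiny sketch of that behaviour, using a hypothetical `truncate_name()` helper that is not in the patch; note that a short name already ending in `.pdf` comes out with a doubled suffix:

```python
def truncate_name(name, limit=49):
    # mirrors the new lines in store_pdf(): keep at most `limit` characters,
    # then force a .pdf suffix
    return name[:limit] + '.pdf'


if __name__ == '__main__':
    print(len(truncate_name('a' * 120 + '.pdf')))   # 53: 49 kept + '.pdf'
    print(truncate_name('report.pdf'))              # 'report.pdf.pdf'
```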
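
`search_pdf()` now picks a random user agent per query and returns `-1` when Google answers with an HTTP error (for example a 429 when the scraper gets rate-limited), which `seek_and_analyse()` checks before downloading anything. A rough sketch of the same pattern using the `googlesearch` calls from the patch; the `example.com` domain is only a placeholder:

```python
import urllib.error   # imported explicitly here; the patch relies on plain "import urllib"

import googlesearch as gs


def search_pdf_urls(domain, stop=10):
    query = '%s filetype:pdf' % domain
    urls = []
    try:
        # random user agent per run, as in search_pdf()
        for url in gs.search(query, num=20, stop=stop,
                             user_agent=gs.get_random_user_agent()):
            urls.append(url)
    except urllib.error.HTTPError as e:
        # -1 sentinel, checked by the caller before any download starts
        print('Error: %s' % e)
        return -1
    return urls


if __name__ == '__main__':
    result = search_pdf_urls('example.com', stop=5)
    if result != -1:
        print('\n'.join(result))
```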