From e1d7c3f7600f7c5ca36585c7f958e0d4479f2fb1 Mon Sep 17 00:00:00 2001
From: c0decave
Date: Tue, 5 Nov 2019 14:42:24 +0100
Subject: [PATCH] release of version 0.4.7: added HTML reporting, added
 logging, reordered libraries, added experimental XMP metadata support, fixed
 a bug introduced by the XMP metadata code, added a todo list

---
 docs/Changelog    |  20 ++
 docs/Todo         |   4 +
 libs/__init__.py  |   0
 libs/libgoogle.py |  30 ++
 libs/libhelper.py |  37 +++
 libs/liblog.py    |  17 ++
 pdfgrab.py        | 756 ++++++++++++++++++++++------------------
 7 files changed, 476 insertions(+), 388 deletions(-)
 create mode 100644 docs/Changelog
 create mode 100644 docs/Todo
 create mode 100644 libs/__init__.py
 create mode 100644 libs/libgoogle.py
 create mode 100644 libs/libhelper.py
 create mode 100644 libs/liblog.py

diff --git a/docs/Changelog b/docs/Changelog
new file mode 100644
index 0000000..4fb9939
--- /dev/null
+++ b/docs/Changelog
@@ -0,0 +1,20 @@
+Changelog
+=========
+
+Version 0.4.7
+-------------
+
+* added HTML output
+* added experimental XMP metadata testing
+
+Version 0.4.6
+-------------
+
+* added help output when no arguments are given at the CLI
+* added googlesearch lib
+
+Version 0.4.5
+-------------
+
+* exported helper functions to libs/libhelper.py
+* added libs/liblog.py
diff --git a/docs/Todo b/docs/Todo
new file mode 100644
index 0000000..3dd5455
--- /dev/null
+++ b/docs/Todo
@@ -0,0 +1,4 @@
+* add XMP metadata to the output files
+* code reordering
+* clean up parsing functions
+* add report formats
diff --git a/libs/__init__.py b/libs/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/libs/libgoogle.py b/libs/libgoogle.py
new file mode 100644
index 0000000..e374081
--- /dev/null
+++ b/libs/libgoogle.py
@@ -0,0 +1,30 @@
+import googlesearch as gs
+import urllib
+
+from libs.libhelper import *
+
+def get_random_agent():
+    return (gs.get_random_user_agent())
+
+def search_pdf(search, args):
+    ''' the function where googlesearch by Mario Vilas
+        is called
+    '''
+
+    search_stop = args.search_stop
+
+    query = '%s filetype:pdf' % search
+    # print(query)
+    urls = []
+
+    try:
+        for url in gs.search(query, num=20, stop=search_stop, user_agent=gs.get_random_user_agent()):
+            # print(url)
+            urls.append(url)
+
+    except urllib.error.HTTPError as e:
+        print('Error: %s' % e)
+        return -1
+
+    return urls
diff --git a/libs/libhelper.py b/libs/libhelper.py
new file mode 100644
index 0000000..1861e8b
--- /dev/null
+++ b/libs/libhelper.py
@@ -0,0 +1,37 @@
+import os
+import sys
+
+from Crypto.Hash import SHA256
+
+def make_directory(outdir):
+    ''' naive mkdir function '''
+    try:
+        os.mkdir(outdir)
+    except OSError:
+        # print("[W] mkdir failed, directory probably exists")
+        pass
+
+def url_strip(url):
+    url = url.rstrip("\n")
+    url = url.rstrip("\r")
+    return url
+
+def create_sha256(hdata):
+    ''' introduced to create hashes of filenames, to have a unique id;
+        hashes of the file contents themselves are the next topic
+    '''
+    hobject = SHA256.new(data=hdata.encode())
+    return (hobject.hexdigest())
+
+def find_name(pdf):
+    ''' simply parses the urlencoded name and extracts the storage name;
+        i would not be surprised if this naive approach breaks on unusual URLs
+    '''
+
+    # find the name of the file
+    name = pdf.split("/")
+    a = len(name)
+    name = name[a - 1]
+    # print(name)
+
+    return name
diff --git a/libs/liblog.py b/libs/liblog.py
new file mode 100644
index 0000000..2c01d25
--- /dev/null
+++ b/libs/liblog.py
@@ -0,0 +1,17 @@
+import logging
+
+logger = logging.getLogger(__name__)
+logger.setLevel(logging.DEBUG)
+
+file_handler = logging.FileHandler('pdfgrab.log')
+
+console_handler
= logging.StreamHandler() +console_handler.setLevel(logging.WARNING) + +formatter = logging.Formatter('%(asctime)s:%(name)s:%(levelname)s:%(message)s') + +file_handler.setFormatter(formatter) +console_handler.setFormatter(formatter) + +logger.addHandler(file_handler) +logger.addHandler(console_handler) diff --git a/pdfgrab.py b/pdfgrab.py index 9ec84b0..41c9e14 100755 --- a/pdfgrab.py +++ b/pdfgrab.py @@ -1,61 +1,35 @@ #!/usr/bin/env python3 ##################### -# yay - old tool adjusted for python3, using googlesearch now -# and not some self crafted f00 -# # new features, new layout, new new :> -# by dash at the end of September 2019 -# -# TODO -# * add complete path in output as well as url where pdf came from -# -> if url not exist like -F mode, then the local path -# * clean up code -# * fine tune google search -# * add random timeout for new requests -# -> maybe not necessary, gs has it ... -# -> sort of necessary, on the other hand use proxychains man -# * uh oh some fancy c0l0rs -# * add thread support -# * add scrape mode, to search for pdfs at the website itself -# * add current error conditions to logfile -# -# Done -# * add url list to output -# * queues added, but no thread support yet -# * json file output -# * txt file output -# * outfilename hardcoded -# * add decryption routine -# * catch ssl exceptions -# * add random useragent for google and website pdf gathering -# * set option for certificate verification, default is true -# * catch conn refused connections -# * catch filename to long thingy +# by dash -import os -import sys +import xml +import argparse import json +import os import queue import urllib -import argparse -import requests +from json2html import * -# remove somewhen ;) -from IPython import embed - -from PyPDF2 import pdf import PyPDF2 -from Crypto.Hash import SHA256 -from collections import deque # googlesearch library import googlesearch as gs +import requests +from PyPDF2 import pdf + +# functions to extern files +from libs.liblog import logger +from libs.libhelper import * +from libs.libgoogle import * + +from IPython import embed # some variables in regard of the tool itself -name = 'pdfgrab' -version = '0.4.4' -author = 'dash' -date = '2019' +name = 'pdfgrab' +version = '0.4.7' +author = 'dash' +date = '2019' # queues for processing # this queue holds the URL locations of files to download @@ -68,76 +42,93 @@ pdf_q = queue.Queue() # this is the analysis queue, keeping the data for further processing ana_q = queue.Queue() -def create_sha256(hdata): - ''' introduced to create hashes of filenames, to have a uniqid - of course hashes of the file itself will be the next topic - ''' - hobject = SHA256.new(data=hdata.encode()) - return (hobject.hexdigest()) - -def process_queue_data(filename,data,queue_type): - ''' main function for processing gathered data - i use this central function for it, so it is at *one* place - and it is easy to change the data handling at a later step without - deconstructing the who code - ''' - ana_dict = {} - url_dict = {} - - if queue_type=='doc_info': - print('[v] Queue DocInfo Data %s' % (filename)) - name = find_name(filename) - path = filename - - # create a hash over the file path - # hm, removed for now - #path_hash = create_sha256(path) - - # order data in dict for analyse queue - ana_dict = {path : {'filename':name,'data':data}} -# print(data) -# print(ana_dict) - - # add the data to queue - add_queue(ana_q,ana_dict) - - elif queue_type=='url': - # prepare queue entry - print('[v] Url Queue %s' % (data)) - url_dict = 
{'url':data,'filename':filename} - sha256=create_sha256(data) - url_d[sha256]=url_dict - - # add dict to queue - add_queue(url_q,url_dict) - - else: - print('[-] Sorry, unknown queue. DEBUG!') - return False - - return True - def add_queue(tqueue, data): - ''' wrapper function for adding easy data to + ''' wrapper function for adding easy data to created queues. otherwise the functions will be scattered with endless queue commands ;) ''' - tqueue.put(data) - #d=tqueue.get() - #print(d) - return True + tqueue.put(data) + # d=tqueue.get() + #logging.debug(d) + return True -def url_strip(url): - url = url.rstrip("\n") - url = url.rstrip("\r") - return url +def process_queue_data(filename, data, queue_type): + ''' main function for processing gathered data + i use this central function for it, so it is at *one* place + and it is easy to change the data handling at a later step without + deconstructing the who code + ''' + ana_dict = {} + url_dict = {} -def get_random_agent(): - return (gs.get_random_user_agent()) + if queue_type == 'doc_info': + logger.info('Queue DocInfo Data {0}'.format(filename)) + name = find_name(filename) + path = filename + + # create a hash over the file path + # hm, removed for now + # path_hash = create_sha256(path) + + # order data in dict for analyse queue + ana_dict = {path: {'filename': name, 'data': data}} + #print('data:',data) + #print('ana_dcit:',ana_dict) + + # add the data to queue + add_queue(ana_q, ana_dict) + + elif queue_type == 'doc_xmp_info': + logger.info('Queue DocXMPInfo Data {0}'.format(filename)) + logger.warning('DocXMPInfo json processing not supported {0}'.format(filename)) + + elif queue_type == 'url': + # prepare queue entry + logger.info('Url Queue {0}'.format(data)) + url_dict = {'url': data, 'filename': filename} + sha256 = create_sha256(data) + url_d[sha256] = url_dict + + # add dict to queue + add_queue(url_q, url_dict) + + else: + print('[-] Sorry, unknown queue. DEBUG!') + logger.critical('Unknown queue') + return False + + return True + +def get_xmp_meta_data(filename, filehandle): + ''' get the xmp meta data + ''' + + err_dict = {} + real_extract = {} + xmp_dict = {} + + fh = filehandle + + try: + xmp_meta = fh.getXmpMetadata() + + except xml.parsers.expat.ExpatError as e: + print('Error: %s' % e) + err_dict = {'error': str(e)} + return -1 + + finally: + process_queue_data(filename, err_dict, 'doc_xmp_info') + + if xmp_meta != None: + print('xmp_meta: {0} {1} {2} {3} {4} {5}'.format(xmp_meta.pdf_producer,xmp_meta.pdf_pdfversion,xmp_meta.dc_contributor,xmp_meta.dc_creator,xmp_meta.dc_date,xmp_meta.dc_subject)) + xmp_dict = {} + + return xmp_dict def get_DocInfo(filename, filehandle): - ''' the easy way to extract metadata + ''' the easy way to extract metadata indirectObjects... 
there is an interesting situation, some pdfs seem to have the same information stored @@ -152,357 +143,346 @@ def get_DocInfo(filename, filehandle): bad example: ''' - err_dict = {} - real_extract = {} + err_dict = {} + real_extract = {} - fh = filehandle + fh = filehandle - try: - extract = fh.documentInfo + try: + extract = fh.documentInfo - except pdf.utils.PdfReadError as e: - print('Error: %s' % e) - err_dict={'error':str(e)} - return -1 + except pdf.utils.PdfReadError as e: + print('Error: %s' % e) + err_dict = {'error': str(e)} + return -1 - except PyPDF2.utils.PdfReadError as e: - print('Error: %s' % e) - err_dict={'error':str(e)} - return -1 + except PyPDF2.utils.PdfReadError as e: + print('Error: %s' % e) + err_dict = {'error': str(e)} + return -1 - finally: - process_queue_data(filename,err_dict,'doc_info') + finally: + process_queue_data(filename, err_dict, 'doc_info') - print('-'*80) - print('File: %s' % filename) -# embed() - # there are situations when documentinfo does not return anything - # and extract is None - if extract==None: - err_dict={'error':'getDocumentInfo() returns None'} - process_queue_data(filename,err_dict,'doc_info') - return -1 + print('-' * 80) + print('File: %s' % filename) + # embed() + # there are situations when documentinfo does not return anything + # and extract is None + if extract == None: + err_dict = {'error': 'getDocumentInfo() returns None'} + process_queue_data(filename, err_dict, 'doc_info') + return -1 + try: + for k in extract.keys(): + key = str(k) + value = str(extract[k]) + edata = '%s %s' % (key, value) + print(edata) + print + real_extract[key] = value + print('-' * 80) - try: - for k in extract.keys(): - key = str(k) - value = str(extract[k]) - edata = '%s %s' % (key,value) - print(edata) - print - real_extract[key]=value - print('-'*80) + except PyPDF2.utils.PdfReadError as e: + print('Error: %s' % e) + err_dict = {'error': str(e)} + process_queue_data(filename, err_dict, 'doc_info') + return -1 - except PyPDF2.utils.PdfReadError as e: - print('Error: %s' % e) - err_dict={'error':str(e)} - process_queue_data(filename,err_dict,'doc_info') - return -1 - - - process_queue_data(filename,real_extract,'doc_info') + process_queue_data(filename, real_extract, 'doc_info') def decrypt_empty_pdf(filename): - ''' this function simply tries to decrypt the pdf with the null password + ''' this function simply tries to decrypt the pdf with the null password this does work, as long as no real password has been set if a complex password has been set -> john ''' - fr = pdf.PdfFileReader(open(filename,"rb")) - try: - fr.decrypt('') + fr = pdf.PdfFileReader(open(filename, "rb")) + try: + fr.decrypt('') + + except NotImplementedError as e: + # print('Error: %s' % (e)) + print('Error: File: %s encrypted. %s' % (filename, str(e))) + return -1 + return fr - except NotImplementedError as e: - #print('Error: %s' % (e)) - print('Error: File: %s encrypted. 
%s' % (filename,str(e))) - return -1 - return fr - def check_encryption(filename): - ''' basic function to check if file is encrypted + ''' basic function to check if file is encrypted ''' -# print(filename) - try: - fr = pdf.PdfFileReader(open(filename,"rb")) - except pdf.utils.PdfReadError as e: - print('Error: %s' % e) - return -1 + # print(filename) + try: + fr = pdf.PdfFileReader(open(filename, "rb")) + except pdf.utils.PdfReadError as e: + print('Error: %s' % e) + return -1 - if fr.getIsEncrypted()==True: - print('[i] File encrypted %s' % filename) - nfr = decrypt_empty_pdf(filename) - if nfr != -1: - get_DocInfo(filename,nfr) + if fr.getIsEncrypted() == True: + print('[i] File encrypted %s' % filename) + nfr = decrypt_empty_pdf(filename) + if nfr != -1: + get_DocInfo(filename, nfr) + get_xmp_meta_data(filename,nfr) - else: - get_DocInfo(filename,fr) + else: + get_DocInfo(filename, fr) + get_xmp_meta_data(filename,fr) - #fr.close() + # fr.close() - return True + return True -def find_name(pdf): - ''' simply parses the urlencoded name and extracts the storage name - i would not be surprised this naive approach can lead to fuckups - ''' - - #find the name of the file - name = pdf.split("/") - a = len(name) - name = name[a-1] - #print(name) - - return name - -def make_directory(outdir): - ''' naive mkdir function ''' - try: - os.mkdir(outdir) - except: - #print("[W] mkdir, some error, directory probably exists") - pass def download_pdf(url, args, header_data): - ''' downloading the pdfile for later analysis ''' + ''' downloading the pdfile for later analysis ''' - # check the remote tls certificate or not? - cert_check = args.cert_check + # check the remote tls certificate or not? + cert_check = args.cert_check - try: - req = requests.get(url,headers=header_data,verify=cert_check) - #req = requests.get(url,headers=header_data,verify=False) - data = req.content - status_code = req.status_code + try: + req = requests.get(url, headers=header_data, verify=cert_check) + # req = requests.get(url,headers=header_data,verify=False) + data = req.content + status_code = req.status_code - except requests.exceptions.SSLError as e: - print('Error: %s' % e) - return -1 + except requests.exceptions.SSLError as e: + print('Error: %s' % e) + return -1 - except: - print('Error: Probably something wrong with remote server') - return -1 + except: + print('Error: Probably something wrong with remote server') + return -1 - if status_code == 403: - print('%s http/403 Forbidden' % (url)) - return -1 + if status_code == 403: + print('%s http/403 Forbidden' % (url)) + return -1 - #print(len(data)) - return data + # print(len(data)) + return data -def store_pdf(url,data,outdir): - ''' storing the downloaded pdf data - ''' - print('[v] store_pdf') - name = find_name(url) - # only allow stored file a name with 50 chars - if len(name)>50: - name = name[:49] + '.pdf' - #print(len(name)) +def store_pdf(url, data, outdir): + ''' storing the downloaded pdf data + ''' - save = "%s/%s" % (outdir,name) + logger.info('Store pdf') + name = find_name(url) - try: - f = open(save,"wb") - except OSError as e: - print('Error: %s' % (e)) - return -1 + # only allow stored file a name with 50 chars + if len(name) > 50: + name = name[:49] + '.pdf' + # print(len(name)) + + save = "%s/%s" % (outdir, name) + + try: + f = open(save, "wb") + except OSError as e: + print('Error: %s' % (e)) + return -1 + + ret = f.write(data) + logger.info('Written {0} bytes for file: {1}'.format(ret,save)) + f.close() + + # return the savepath + return save 
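# --- illustrative sketch (editor's example, not part of this patch) ---
# store_pdf() above truncates long names to 50 characters, and libs/libhelper.py
# already provides create_sha256() to derive a unique id from a filename. A
# minimal, hypothetical combination of the two -- safe_save_name() is not a
# function in this patch -- could keep truncated names collision-free:

import os
from Crypto.Hash import SHA256  # same dependency libs/libhelper.py already uses

def create_sha256(hdata):
    # identical to the helper in libs/libhelper.py
    hobject = SHA256.new(data=hdata.encode())
    return hobject.hexdigest()

def safe_save_name(url, outdir, max_len=50):
    # same naive split as find_name(), plus a short URL hash when truncating
    name = url.split("/")[-1]
    if len(name) > max_len:
        name = '%s_%s.pdf' % (name[:20], create_sha256(url)[:12])
    return os.path.join(outdir, name)

# e.g. safe_save_name('https://example.org/a-very-long-annual-report-name-2019-edition.pdf', 'pdfgrab')
# --- end of sketch ---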
- ret=f.write(data) - print('[+] Written %d bytes for File: %s' % (ret,save)) - f.close() - - # return the savepath - return save def _parse_pdf(filename): - ''' the real parsing function ''' + ''' the real parsing function ''' + + ret = check_encryption(filename) + return ret - ret = check_encryption(filename) - return ret def grab_url(url, args, outdir): - ''' function keeping all the steps for the user call of grabbing - just one pdf and analysing it - ''' - header_data={'User-Agent':get_random_agent()} - data = download_pdf(url,args, header_data) - if data != -1: - savepath = store_pdf(url, data, outdir) - _parse_pdf(savepath) + ''' function keeping all the steps for the user call of grabbing + just one pdf and analysing it + ''' + header_data = {'User-Agent': get_random_agent()} + data = download_pdf(url, args, header_data) + if data != -1: + savepath = store_pdf(url, data, outdir) + _parse_pdf(savepath) - return - -def seek_and_analyse(search,args,outdir): - ''' function for keeping all the steps of searching for pdfs and analysing - them together - ''' - # use the search function of googlesearch to get the results - search_pdf(search,args) - #urls = search_pdf(search,args) + return - # *if* we get an answer - if url_q.empty()==False: - #if urls != -1: - # process through the list and get the pdfs - while url_q.empty()==False: - item=url_q.get() - #print(item) - url = item['url'] - grab_url(url,args,outdir) +def seek_and_analyse(search, args, outdir): + ''' function for keeping all the steps of searching for pdfs and analysing + them together + ''' + # use the search function of googlesearch to get the results + urls=search_pdf(search, args) + for item in urls: + filename = find_name(item) + process_queue_data(filename, item, 'url') -def search_pdf(search, args): - ''' the function where googlesearch from mario vilas - is called - ''' + # urls = search_pdf(search,args) - search_stop = args.search_stop + # *if* we get an answer + if url_q.empty() == False: + # if urls != -1: + # process through the list and get the pdfs + while url_q.empty() == False: + item = url_q.get() + # print(item) + url = item['url'] + grab_url(url, args, outdir) - query='%s filetype:pdf' % search - #print(query) - urls = [] - try: - for url in gs.search(query,num=20,stop=search_stop,user_agent=gs.get_random_user_agent()): - #print(url) - # parse out the name of the file in the url - filename=find_name(url) - # add the file to queue - process_queue_data(filename,url,'url') - urls.append(url) - - except urllib.error.HTTPError as e: - print('Error: %s' % e) - return -1 - #return urls def run(args): - # outfile name - if args.outfile: - out_filename = args.outfile - else: - out_filename = 'pdfgrab_analysis' + # initialize logger + logger.info('{0} Started'.format(name)) - # specify output directory - outdir = args.outdir + # outfile name + if args.outfile: + out_filename = args.outfile + else: + out_filename = 'pdfgrab_analysis' - # create output directory - make_directory(outdir) + # specify output directory + outdir = args.outdir - # lets see what the object is - if args.url_single: - url = args.url_single - print('[+] Grabbing %s' % (url)) - grab_url(url, args,outdir) + # create output directory + make_directory(outdir) - elif args.file_single: - pdffile = args.file_single - print('[+] Parsing %s' % (pdffile)) - _parse_pdf(pdffile) + # lets see what the object is + if args.url_single: + url = args.url_single + logger.info('Grabbing {0}'.format(url)) + logger.write_to_log('Grabbing %s' % (url)) + grab_url(url, 
args, outdir) - elif args.search: - search = args.search - #print(args) - print('[+] Seek and de...erm...analysing %s' % (search)) - seek_and_analyse(search,args,outdir) - - elif args.files_dir: - directory = args.files_dir - print('[+] Analyse pdfs in directory %s' % (directory)) - try: - files = os.listdir(directory) - except: - print('Error') - return False + elif args.file_single: + pdffile = args.file_single + logger.info('Parsing {0}'.format(pdffile)) + _parse_pdf(pdffile) - for f in files: - # naive filter function, later usage of filemagic possible - if f.find('.pdf')!=-1: - fpath = '%s/%s' % (directory,f) - _parse_pdf(fpath) + elif args.search: + search = args.search + logger.info('Seek and analyse {0}'.format(search)) + seek_and_analyse(search, args, outdir) - else: - print('[-] Dunno what to do, bro.') - - # move analysis dictionary in queue back to dictionary - analysis_dict = {} - while ana_q.empty()==False: - item = ana_q.get() - #print('item ', item) - analysis_dict.update(item) - - # ana_q is empty now + elif args.files_dir: + directory = args.files_dir + logger.info('Analyse pdfs in directory {0}'.format(directory)) + try: + files = os.listdir(directory) + except: + logger.warning('Error in args.files_dir') + return False - # create txt output - sep = '-'*80 + '\n' - txtout = "%s/%s.txt" % (outdir,out_filename) - fwtxt = open(txtout,'w') - #print(analysis_dict) - for k in analysis_dict.keys(): - fwtxt.write(sep) - fname = 'File: %s\n' % (analysis_dict[k]['filename']) - ddata = analysis_dict[k]['data'] - fwtxt.write(fname) - for kdata in ddata.keys(): - metatxt = '%s:%s\n' % (kdata,ddata[kdata]) - fwtxt.write(metatxt) - fwtxt.write(sep) - fwtxt.close() - - # create json output - jsonout = "%s/%s.json" % (outdir,out_filename) - fwjson = open(jsonout,'w') - #for k in analysis_dict.keys(): - #print(analysis_dict[k]) - # jdata = json.dumps(analysis_dict[k]) + for f in files: + # naive filter function, later usage of filemagic possible + if f.find('.pdf') != -1: + fpath = '%s/%s' % (directory, f) + _parse_pdf(fpath) - #print(analysis_dict) - jdata = json.dumps(analysis_dict) - fwjson.write(jdata) - fwjson.close() + else: + print('[-] Dunno what to do, bro. Use help. 
{0} -h'.format(sys.argv[0])) - # create url savefile - #print('url_d: ', url_d) - jsonurlout = "%s/%s_url.json" % (outdir,out_filename) - fwjson = open(jsonurlout,'w') - jdata = json.dumps(url_d) - fwjson.write(jdata) - fwjson.close() + # move analysis dictionary in queue back to dictionary + analysis_dict = {} + while ana_q.empty() == False: + item = ana_q.get() + # print('item ', item) + analysis_dict.update(item) + + #print('dict:',analysis_dict) + # ana_q is empty now + + # create txt output + sep = '-' * 80 + '\n' + txtout = "%s/%s.txt" % (outdir, out_filename) + fwtxt = open(txtout, 'w') + # print(analysis_dict) + for k in analysis_dict.keys(): + fwtxt.write(sep) + fname = 'File: %s\n' % (analysis_dict[k]['filename']) + ddata = analysis_dict[k]['data'] + fwtxt.write(fname) + for kdata in ddata.keys(): + metatxt = '%s:%s\n' % (kdata, ddata[kdata]) + fwtxt.write(metatxt) + fwtxt.write(sep) + fwtxt.close() + + # create json output + jsonout = "%s/%s.json" % (outdir, out_filename) + fwjson = open(jsonout, 'w') + + # print(analysis_dict) + jdata = json.dumps(analysis_dict) + fwjson.write(jdata) + fwjson.close() + + # create html from json + htmlout = "%s/%s.html" % (outdir, out_filename) + fwhtml = open(htmlout,'w') + #print(jdata) + html = json2html.convert(json = jdata) + fwhtml.write(html) + fwhtml.close() + + + # create url savefile + # print('url_d: ', url_d) + jsonurlout = "%s/%s_url.json" % (outdir, out_filename) + fwjson = open(jsonurlout, 'w') + jdata = json.dumps(url_d) + fwjson.write(jdata) + fwjson.close() + + txtout = "%s/%s_url.txt" % (outdir, out_filename) + fwtxt = open(txtout, 'w') + for k in url_d.keys(): + ddata = url_d[k] + metatxt = '%s:%s\n' % (ddata['url'], ddata['filename']) + fwtxt.write(metatxt) + fwtxt.close() + + return 42 - txtout = "%s/%s_url.txt" % (outdir,out_filename) - fwtxt = open(txtout,'w') - for k in url_d.keys(): - ddata = url_d[k] - metatxt = '%s:%s\n' % (ddata['url'], ddata['filename']) - fwtxt.write(metatxt) - fwtxt.close() - - return 42 - # This is the end my friend. +# This is the end my friend. 
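# --- illustrative sketch (editor's example, not part of this patch) ---
# the HTML report added in 0.4.7 is produced by feeding the same JSON string
# that backs <outfile>.json through json2html.convert(), exactly as run() does
# above; the analysis_dict sample below is made up for illustration

import json
from json2html import json2html

analysis_dict = {
    'pdfgrab/example.pdf': {
        'filename': 'example.pdf',
        'data': {'/Producer': 'LaTeX with hyperref', '/Author': 'jane'},
    }
}

jdata = json.dumps(analysis_dict)
html = json2html.convert(json=jdata)

with open('pdfgrab_analysis.html', 'w') as fwhtml:
    fwhtml.write(html)
# --- end of sketch ---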
def main(): - parser_desc = "%s %s %s in %s" % (name,version,author,date) - parser = argparse.ArgumentParser(prog = name, description=parser_desc) - parser.add_argument('-O','--outdir',action='store',dest='outdir',required=False,help="define the outdirectory for downloaded files and analysis output",default='pdfgrab') - parser.add_argument('-o','--outfile',action='store',dest='outfile',required=False,help="define file with analysis output, if no parameter given it is outdir/pdfgrab_analysis, please note outfile is *always* written to output directory so do not add the dir as extra path") - parser.add_argument('-u','--url',action='store',dest='url_single',required=False,help="grab pdf from specified url for analysis",default=None) - #parser.add_argument('-U','--url-list',action='store',dest='urls_many',required=False,help="specify txt file with list of pdf urls to grab",default=None) -######### - parser.add_argument('-f','--file',action='store',dest='file_single',required=False,help="specify local path of pdf for analysis",default=None) - parser.add_argument('-F','--files-dir',action='store',dest='files_dir',required=False,help="specify local path of *directory* with pdf *files* for analysis",default=None) - parser.add_argument('-s','--search',action='store',dest='search',required=False,help="specify domain or tld to scrape for pdf-files",default=None) - parser.add_argument('-sn','--search-number',action='store',dest='search_stop',required=False,help="specify how many files are searched",default=10,type=int) - parser.add_argument('-z','--disable-cert-check',action='store_false',dest='cert_check',required=False,help="if the target domain(s) run with old or bad certificates",default=True) + parser_desc = "%s %s %s in %s" % (name, version, author, date) + parser = argparse.ArgumentParser(prog=name, description=parser_desc) + parser.add_argument('-O', '--outdir', action='store', dest='outdir', required=False, + help="define the outdirectory for downloaded files and analysis output", default='pdfgrab') + parser.add_argument('-o', '--outfile', action='store', dest='outfile', required=False, + help="define file with analysis output, if no parameter given it is outdir/pdfgrab_analysis, please note outfile is *always* written to output directory so do not add the dir as extra path") + parser.add_argument('-u', '--url', action='store', dest='url_single', required=False, + help="grab pdf from specified url for analysis", default=None) + # parser.add_argument('-U','--url-list',action='store',dest='urls_many',required=False,help="specify txt file with list of pdf urls to grab",default=None) + ######### + parser.add_argument('-f', '--file', action='store', dest='file_single', required=False, + help="specify local path of pdf for analysis", default=None) + parser.add_argument('-F', '--files-dir', action='store', dest='files_dir', required=False, + help="specify local path of *directory* with pdf *files* for analysis", default=None) + parser.add_argument('-s', '--search', action='store', dest='search', required=False, + help="specify domain or tld to scrape for pdf-files", default=None) + parser.add_argument('-sn', '--search-number', action='store', dest='search_stop', required=False, + help="specify how many files are searched", default=10, type=int) + parser.add_argument('-z', '--disable-cert-check', action='store_false', dest='cert_check', required=False, + help="if the target domain(s) run with old or bad certificates", default=True) + + if len(sys.argv)<2: + parser.print_help(sys.stderr) + sys.exit() 
+
+    args = parser.parse_args()
+    run(args)
-    args = parser.parse_args()
-    run(args)
 if __name__ == "__main__":
-    main()
+    main()
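For reference, a minimal sketch (editor's example, not shipped in this patch) of how the
new libs/libgoogle.py helper is called from outside pdfgrab.py. search_pdf() returns a
list of result URLs, or -1 when googlesearch raises an HTTPError, so callers should check
for that sentinel before iterating. The Args class below is only a stand-in for the
argparse namespace and mimics the single attribute the helper reads.

    from libs.libgoogle import search_pdf

    class Args:
        # stand-in for the argparse namespace; search_pdf() only reads search_stop
        search_stop = 10

    urls = search_pdf('example.org', Args())

    if urls == -1:
        # search_pdf() returns -1 when the query hits an urllib HTTPError
        print('search failed')
    else:
        for url in urls:
            print(url)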