release version 0.4.9

Author: c0decave
Date:   2019-11-07 12:51:30 +01:00
parent a89ac93c3d
commit 132750867f
10 changed files with 477 additions and 161 deletions

View File

@@ -1,6 +1,6 @@
 # pdfgrab
-* Version 0.4.8-Pre
+* Version 0.4.9
 ## What is it?
@@ -9,21 +9,18 @@ Basically it analyses PDF files for Metadata. You can direct it to a file or dir
 You can show it the url of a pdf or use the integrated googlesearch (thanx to mario vilas class)
 to search for pdfs at target site, download and analyse them.
-## What is new in 0.4.8 bug fix pre-release?
-* catching google error at too many requests
-* catching dns resolve urlopen error at googlelib
-* fixing annoying bug in regard of pdfs behind urls like http://host/pdf/
-* fixing zero size pdf error(online linked pdfs which are not accessable)
-* added some logging
-## What is new in 0.4.7 release?
-* Added support for html output file, this will be placed in the outdir path and is more clear then a text or json file
-* Added basic logging support, logfile is placed in pdfgrab.py directory
-* Reordered Codebase, exported functionality to some libraries
-* PDF XMP Metadata is grabbed now as well, but not yet saved in output files
-* added docs/ section with Changelog and Todo
+## What is new in 0.4.9?
+* exported reporting methods to libreport.py
+* added optargs for disabling the different report methods
+* made the html report a bit more shiny
+* added a function for generating the html report after analysis
+* exported requests and data storage to a new library
+* code fixes and clearer error handling
+* removed the mandatory site: parameter from the -s search flag
+* updated readme
+* the -s flag now accepts several domains
+* cleaner console logging
 ## What information can be gathered?
@@ -132,7 +129,7 @@ Will analyse all pdf's in that directory
 ### Google Search Mode
 ```
-# ./pdfgrab.py -s site:kernel.org
+# ./pdfgrab.py -s kernel.org
 ```
 Result:
 ```
@@ -164,6 +161,26 @@ File: pdfgrab/bpf_global_data_and_static_keys.pdf
 /PTEX.Fullbanner This is pdfTeX, Version 3.14159265-2.6-1.40.17 (TeX Live 2016) kpathsea version 6.2.2
 ```
+### Google Search Mode, several domains
+```
+# ./pdfgrab.py -s example.com,example.us
+```
+### Reporting
+pdfgrab outputs the gathered information in several formats. Unless disabled by one of the reporting flags (see -h), you will
+find the following in the output directory:
+* html report
+* text report
+* text url list
+* json data
+* json url list
+### Logging
+pdfgrab creates a logfile called "pdfgrab.log" in the running directory.
 ## Google
 * Search: filetype:pdf site:com
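
As a purely illustrative combination of the new README options above, a run that searches two domains at once and skips the html report and the json url list could look like this (flag names taken from the argument parser changes further down in this commit; shown only as a hypothetical example):
```
# ./pdfgrab.py -s kernel.org,example.com -rhd -rujd
```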

View File

@@ -1,6 +1,20 @@
 Changelog
 =========
+Version 4.9
+-----------
+* exported reporting methods to libreport.py
+* added optargs for disabling the different report methods
+* made the html report a bit more shiny
+* added a function for generating the html report after analysis
+* exported requests and data storage to a new library
+* code fixes and clearer error handling
+* removed the mandatory site: parameter from the -s search flag
+* updated readme
+* the -s flag now accepts several domains
+* cleaner console logging
 Version 4.8 Bugfix-PreRelease
 -----------------------------

View File

@@ -5,19 +5,44 @@ from libs.libhelper import *
 def get_random_agent():
     return (gs.get_random_user_agent())

-def search_pdf(search, args):
+def hits_google(search, args):
+    ''' the function where googlesearch from mario vilas
+    is called
+    '''
+    s = search.split(',')
+    query = 'filetype:pdf'
+    try:
+        hits = gs.hits(query, domains=s, user_agent=gs.get_random_user_agent())
+    except urllib.error.HTTPError as e:
+        return False, e
+    except urllib.error.URLError as e:
+        return False, e
+    except IndexError as e:
+        return False, e
+    return True, hits
+
+def search_google(search, args):
     ''' the function where googlesearch from mario vilas
     is called
     '''
+    s = search.split(',')
     search_stop = args.search_stop
-    query = '%s filetype:pdf' % search
+    query = 'filetype:pdf'
+    #query = 'site:%s filetype:pdf' % search
     # print(query)
     urls = []
     try:
-        for url in gs.search(query, num=20, stop=search_stop, user_agent=gs.get_random_user_agent()):
+        for url in gs.search(query, num=20, domains=s, stop=search_stop, user_agent=gs.get_random_user_agent()):
             #print(url)
             urls.append(url)
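
As a quick orientation for this refactor, here is a hypothetical standalone call of search_google; the (status, result) tuple shape is inferred from its call site in seek_and_analyse further down, and SimpleNamespace merely stands in for the argparse namespace:

```python
# Hypothetical usage sketch; assumes the repo's package layout (run from the repo root).
from types import SimpleNamespace
from libs.libgoogle import search_google

args = SimpleNamespace(search_stop=10)      # only search_stop is read by search_google
ok, result = search_google('kernel.org,example.com', args)
if ok:
    for url in result:                      # on success: the list of pdf urls found
        print(url)
else:
    print('search failed:', result)         # on failure: the caught exception
```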

View File

@@ -8,10 +8,12 @@ file_handler = logging.FileHandler('pdfgrab.log')
 console_handler = logging.StreamHandler()
 console_handler.setLevel(logging.WARNING)

-formatter = logging.Formatter('%(asctime)s:%(name)s:%(levelname)s:%(message)s')
-file_handler.setFormatter(formatter)
-console_handler.setFormatter(formatter)
+file_formatter = logging.Formatter('%(asctime)s:%(name)s:%(levelname)s:%(message)s')
+console_formatter = logging.Formatter('%(levelname)s:%(message)s')
+
+file_handler.setFormatter(file_formatter)
+console_handler.setFormatter(console_formatter)

 logger.addHandler(file_handler)
 logger.addHandler(console_handler)
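
The practical effect of the two formatters is that modules importing the shared logger (as librequest.py and pdfgrab.py do) get terse console output for warnings and richer, timestamped lines in pdfgrab.log. A minimal sketch, assuming the logger's own level is set to INFO earlier in liblog.py (not shown in this hunk):

```python
from libs.liblog import logger

logger.info('written to pdfgrab.log only, with timestamp, logger name and level')
logger.warning('additionally echoed to the console in the short levelname:message format')
```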

libs/libreport.py (new file, 173 lines)

@@ -0,0 +1,173 @@
import os
import sys
import json
from json2html import *
from libs.pdf_png import get_png_base64

def prepare_analysis_dict(ana_queue):
    '''params: ana_queue - queue with collected information
    '''
    # initiate analysis dictionary
    analysis_dict = {}
    # move analysis dictionary in queue back to dictionary
    while ana_queue.empty() == False:
        item = ana_queue.get()
        # print('item ', item)
        analysis_dict.update(item)
    # ana_q is empty now, return the newly created dictionary
    return analysis_dict
def create_txt_report(analysis_dict, outdir, out_filename):
    ''' create a txt report in the output directory
    '''
    # draw separator lines
    sep = '-' * 80 + '\n'
    # create output filepath
    txtout = "%s/%s.txt" % (outdir, out_filename)
    # open the file and return filedescriptor
    fwtxt = open(txtout, 'w')
    # get the keys of the dict
    for k in analysis_dict.keys():
        # write separator
        fwtxt.write(sep)
        # build entry filename of the pdf
        fname = 'File: %s\n' % (analysis_dict[k]['filename'])
        # build data entry
        ddata = analysis_dict[k]['data']
        # write the filename
        fwtxt.write(fname)
        # write the metadata
        for kdata in ddata.keys():
            metatxt = '%s:%s\n' % (kdata, ddata[kdata])
            fwtxt.write(metatxt)
        # write separator
        fwtxt.write(sep)
    # close the file
    fwtxt.close()
    return True
def create_json_report(analysis_dict, outdir, out_filename):
    ''' create a jsonfile report in the output directory
    '''
    # build json output name
    jsonout = "%s/%s.json" % (outdir, out_filename)
    # open up json output file
    fwjson = open(jsonout, 'w')
    # convert dictionary to json data
    jdata = json.dumps(analysis_dict)
    # write json data to file
    fwjson.write(jdata)
    # close file
    fwjson.close()
    return True
def create_html_report(analysis_dict, outdir, out_filename):
    ''' create a html report from the json data using json2html in the output directory
    '''
    # build up path for html output file
    htmlout = "%s/%s.html" % (outdir, out_filename)
    # open htmlout filedescriptor
    fwhtml = open(htmlout, 'w')
    # some html stuff
    pdfpng = get_png_base64('supply/pdf_base64.png')
    html_style = '<style>.center { display: block; margin-left: auto;margin-right: auto;} table {border-collapse: collapse;} th, td { border: 1px solid black;text-align: left; }</style>\n'
    html_head = '<html><head><title>pdfgrab - {0} item/s</title>{1}</head>\n'.format(len(analysis_dict), html_style)
    html_pdf_png = '<p class="center"><img class="center" src="data:image/jpeg;base64,{0}"><br><center>pdfgrab - grab and analyse pdf files</center><br></p>'.format(pdfpng)
    html_body = '<body>{0}\n'.format(html_pdf_png)
    html_end = '\n<br><br><p align="center"><a href="https://github.com/c0decave/pdfgrab">pdfgrab</a> by <a href="https://twitter.com/User_to_Root">dash</a></p></body></html>\n'
    # some attributes
    attr = 'id="meta-data" class="table table-bordered table-hover", border=1, cellpadding=3 summary="Metadata"'
    # convert dictionary to json data
    # in this mode each finding gets its own table, there are other possibilities
    # but for now i go with this
    html_out = ''
    for k in analysis_dict.keys():
        trans = analysis_dict[k]
        jdata = json.dumps(trans)
        html = json2html.convert(json = jdata, table_attributes=attr)
        html_out = html_out + html + "\n"
        #html_out = html_out + "<p>" + html + "</p>\n"
    #jdata = json.dumps(analysis_dict)
    # create html
    #html = json2html.convert(json = jdata, table_attributes=attr)
    # write html
    fwhtml.write(html_head)
    fwhtml.write(html_body)
    fwhtml.write(html_out)
    fwhtml.write(html_end)
    # close html file
    fwhtml.close()
    return True
def create_url_json(url_d, outdir, out_filename):
    ''' create a json url file in output directory
    '''
    # create url savefile
    jsonurlout = "%s/%s_url.json" % (outdir, out_filename)
    # open up file for writing urls down
    fwjson = open(jsonurlout, 'w')
    # convert url dictionary to json
    jdata = json.dumps(url_d)
    # write json data to file
    fwjson.write(jdata)
    # close filedescriptor
    fwjson.close()
    return True

def create_url_txt(url_d, outdir, out_filename):
    ''' create a txt url file in output directory
    '''
    # build up txt out path
    txtout = "%s/%s_url.txt" % (outdir, out_filename)
    # open up our url txtfile
    fwtxt = open(txtout, 'w')
    # iterating through the keys of the url dictionary
    for k in url_d.keys():
        # get the entry
        ddata = url_d[k]
        # create meta data for saving
        metatxt = '%s:%s\n' % (ddata['url'], ddata['filename'])
        # write metadata to file
        fwtxt.write(metatxt)
    # close fd
    fwtxt.close()
    return True
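
The helpers above all operate on the same analysis dictionary: one entry per grabbed pdf carrying a 'filename' and a 'data' metadata dict, as can be read off create_txt_report. A hypothetical standalone sketch with made-up sample values (in pdfgrab itself the real dictionary is built from the analysis queue by prepare_analysis_dict):

```python
# Hypothetical example data; the keys and metadata names below are illustrative only.
from libs.libreport import create_txt_report, create_json_report

analysis_dict = {
    'example.pdf': {
        'filename': 'pdfgrab/example.pdf',
        'data': {'/Author': 'dash', '/Producer': 'pdfTeX'},
    },
}
# writes ./example_report.txt and ./example_report.json (the directory must already exist)
create_txt_report(analysis_dict, '.', 'example_report')
create_json_report(analysis_dict, '.', 'example_report')
```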

libs/librequest.py (new file, 162 lines)

@@ -0,0 +1,162 @@
import os
import sys
import json
import socket
import requests

from libs.liblog import logger
from libs.libhelper import *
from libs.libgoogle import get_random_agent

def store_file(url, data, outdir):
    ''' storing the downloaded data to a file
    params: url - is used to create the filename
            data - the data of the file
            outdir - to store in which directory
    returns: dict { "code":<code>, "data":<savepath>, "error":<error>} - the status code, the savepath, the errorcode
    '''
    logger.info('Store file {0}'.format(url))
    name = find_name(url)
    # only allow stored file a name with 50 chars
    if len(name) > 50:
        name = name[:49]
    # build up the save path
    save = "%s/%s" % (outdir, name)
    try:
        f = open(save, "wb")
    except OSError as e:
        logger.warning('store_file {0}'.format(e))
        # return ret_dict
        return {"code":False, "data":save, "error":e}
    # write the data and return the written bytes
    ret = f.write(data)
    # check if bytes are zero
    if ret == 0:
        logger.warning('Written {0} bytes for file: {1}'.format(ret, save))
    else:
        # log to info that bytes and file has been written
        logger.info('Written {0} bytes for file: {1}'.format(ret, save))
    # close file descriptor
    f.close()
    # return ret_dict
    return {"code":True, "data":save, "error":False}
def download_file(url, args, header_data):
    ''' downloading the file for later analysis
    params: url - the url
            args - argparse args namespace
            header_data - pre-defined header data
    returns: ret_dict
    '''
    # check the remote tls certificate or not?
    cert_check = args.cert_check
    # run our try catch routine
    try:
        # request the url and save the response in req
        # give header data and set verify as delivered by args.cert_check
        req = requests.get(url, headers=header_data, verify=cert_check)
    except requests.exceptions.SSLError as e:
        logger.warning('download file {0}{1}'.format(url, e))
        # no response object exists if requests.get() itself raised
        return {"code":False, "data":False, "error":e}
    except requests.exceptions.InvalidSchema as e:
        logger.warning('download file {0}{1}'.format(url, e))
        # return retdict
        return {"code":False, "data":False, "error":e}
    except socket.gaierror as e:
        logger.warning('download file, host not known {0} {1}'.format(url, e))
        return {"code":False, "data":False, "error":e}
    except:
        logger.warning('download file, something wrong with remote server? {0}'.format(url))
        # req does not exist if requests.get() itself raised
        if 'req' not in locals():
            req = False
        return {"code":False, "data":req, "error":True}
    #finally:
    #    # lets close the socket
    #    #req.close()
    # return retdict
    return {"code":True, "data":req, "error":False}
def grab_run(url, args, outdir):
    ''' function keeping all the steps for the user call of grabbing
    just one and analysing it
    '''
    header_data = {'User-Agent': get_random_agent()}
    rd_download = download_file(url, args, header_data)
    code_down = rd_download['code']
    # if code is True, download of file was successfull
    if code_down:
        rd_evaluate = evaluate_response(rd_download)
        code_eval = rd_evaluate['code']
        # if code is True, evaluation was also successful
        if code_eval:
            # get the content from the evaluate dictionary request
            content = rd_evaluate['data'].content
            # call store file
            rd_store = store_file(url, content, outdir)
            # get the code
            code_store = rd_store['code']
            # get the savepath
            savepath = rd_store['data']
            # if code is True, storing of file was also successfull
            if code_store:
                return {"code":True, "data":savepath, "error":False}
    return {"code":False, "data":False, "error":True}

def evalute_content(ret_dict):
    pass

def evaluate_response(ret_dict):
    ''' this method comes usually after download_file,
    it will evaluate what has happened and if we even have some data to process
    or not
    params: data - is the req object from the conducted request
    returns: dict { "code":<code>, "data":<savepath>, "error":<error>} - the status code, the savepath, the errorcode
    '''
    # extract data from ret_dict
    req = ret_dict['data']
    # get status code
    url = req.url
    status = req.status_code
    reason = req.reason
    # ahh everything is fine
    if status == 200:
        logger.info('download file, {0} {1} {2}'.format(url, reason, status))
        return {"code":True, "data":req, "error":False}
    # nah something is not like it should be
    else:
        logger.warning('download file, {0} {1} {2}'.format(url, reason, status))
        return {"code":False, "data":req, "error":True}

libs/pdf_png.py (new file, 5 lines)

@@ -0,0 +1,5 @@
def get_png_base64(filename):
    # the file (supply/pdf_base64.png) already holds the base64-encoded image data,
    # so it is read as text and returned unchanged for embedding in the html report
    fr = open(filename, 'r')
    buf = fr.read()
    return buf

View File

@@ -22,12 +22,14 @@ from PyPDF2 import pdf
 from libs.liblog import logger
 from libs.libhelper import *
 from libs.libgoogle import *
+from libs.libreport import *
+from libs.librequest import grab_run
 from IPython import embed

 # some variables in regard of the tool itself
 name = 'pdfgrab'
-version = '0.4.8-Pre'
+version = '0.4.9'
 author = 'dash'
 date = 'November 2019'
@@ -243,72 +245,6 @@ def check_encryption(filename):
     return True

-def download_pdf(url, args, header_data):
-    ''' downloading the pdfile for later analysis '''
-    # check the remote tls certificate or not?
-    cert_check = args.cert_check
-    try:
-        req = requests.get(url, headers=header_data, verify=cert_check)
-        # req = requests.get(url,headers=header_data,verify=False)
-        data = req.content
-        status_code = req.status_code
-    except requests.exceptions.SSLError as e:
-        logger.warning('download pdf {0}{1}'.format(url,e))
-        return -1
-    except:
-        logger.warning('download pdf, something wrong with remote server? {0}'.format(url))
-        return -1
-    if status_code == 403:
-        logger.warning('download pdf, 403 Forbidden {0}'.format(url))
-        return -1
-    # print(len(data))
-    return data
-
-def store_pdf(url, data, outdir):
-    ''' storing the downloaded pdf data
-    '''
-    logger.info('Store pdf {0}'.format(url))
-    name = find_name(url)
-    #logger.warning(url)
-    #logger.warning(name)
-    #logger.warning(outdir)
-    # only allow stored file a name with 50 chars
-    if len(name) > 50:
-        name = name[:49] + '.pdf'
-    # print(len(name))
-    save = "%s/%s" % (outdir, name)
-    try:
-        f = open(save, "wb")
-    except OSError as e:
-        logger.warning('store_pdf {0}'.format(e))
-        return -1
-    ret = f.write(data)
-    logger.info('Written {0} bytes for file: {1}'.format(ret,save))
-    f.close()
-    if ret == 0:
-        logger.warning('Written {0} bytes for file: {1}'.format(ret,save))
-        return save
-        #return -1
-    # return the savepath
-    return save
-
 def _parse_pdf(filename):
     ''' the real parsing function '''
@@ -320,26 +256,18 @@ def _parse_pdf(filename):
         logger.warning('Filesize is 0 bytes at file: {0}'.format(filename))
         return False

-def grab_url(url, args, outdir):
-    ''' function keeping all the steps for the user call of grabbing
-    just one pdf and analysing it
-    '''
-    header_data = {'User-Agent': get_random_agent()}
-    data = download_pdf(url, args, header_data)
-    if data != -1:
-        savepath = store_pdf(url, data, outdir)
-        _parse_pdf(savepath)
-    return
-
 def seek_and_analyse(search, args, outdir):
     ''' function for keeping all the steps of searching for pdfs and analysing
     them together
     '''
+    # check how many hits we got
+    # seems like the method is broken in googlsearch library :(
+    #code, hits = hits_google(search,args)
+    #if code:
+    #    print('Got {0} hits'.format(hits))
+
     # use the search function of googlesearch to get the results
-    code, values = search_pdf(search, args)
+    code, values = search_google(search, args)
     if not code:
         if values.code == 429:
             logger.warning('[-] Too many requests, time to change ip address or use proxychains')
@@ -362,7 +290,11 @@ def seek_and_analyse(search, args, outdir):
         item = url_q.get()
         # print(item)
         url = item['url']
-        grab_url(url, args, outdir)
+        rd_grabrun = grab_run(url, args, outdir)
+        code = rd_grabrun['code']
+        savepath = rd_grabrun['data']
+        if code:
+            _parse_pdf(savepath)

     return True
@@ -372,6 +304,9 @@ def run(args):
     # initialize logger
     logger.info('{0} Started'.format(name))

+    # create some variables
     # outfile name
     if args.outfile:
         out_filename = args.outfile
@@ -381,6 +316,7 @@ def run(args):
     # specify output directory
     outdir = args.outdir

     # create output directory
     make_directory(outdir)
@@ -417,68 +353,43 @@ def run(args):
             fpath = '%s/%s' % (directory, f)
             _parse_pdf(fpath)
+    # simply generate html report from json outfile
+    elif args.gen_html_report:
+        fr = open(args.gen_html_report, 'r')
+        analysis_dict = json.loads(fr.read())
+        if create_html_report(analysis_dict, outdir, out_filename):
+            logger.info('Successfully created html report')
+            sys.exit(0)
+        else:
+            sys.exit(1)
     else:
         print('[-] Dunno what to do, bro. Use help. {0} -h'.format(sys.argv[0]))
+        sys.exit(1)

-    # move analysis dictionary in queue back to dictionary
-    analysis_dict = {}
-    while ana_q.empty() == False:
-        item = ana_q.get()
-        # print('item ', item)
-        analysis_dict.update(item)
-    #print('dict:',analysis_dict)
-    # ana_q is empty now
-
-    # create txt output
-    sep = '-' * 80 + '\n'
-    txtout = "%s/%s.txt" % (outdir, out_filename)
-    fwtxt = open(txtout, 'w')
-    # print(analysis_dict)
-    for k in analysis_dict.keys():
-        fwtxt.write(sep)
-        fname = 'File: %s\n' % (analysis_dict[k]['filename'])
-        ddata = analysis_dict[k]['data']
-        fwtxt.write(fname)
-        for kdata in ddata.keys():
-            metatxt = '%s:%s\n' % (kdata, ddata[kdata])
-            fwtxt.write(metatxt)
-        fwtxt.write(sep)
-    fwtxt.close()
-
-    # create json output
-    jsonout = "%s/%s.json" % (outdir, out_filename)
-    fwjson = open(jsonout, 'w')
-    # print(analysis_dict)
-    jdata = json.dumps(analysis_dict)
-    fwjson.write(jdata)
-    fwjson.close()
-
-    # create html from json
-    htmlout = "%s/%s.html" % (outdir, out_filename)
-    fwhtml = open(htmlout,'w')
-    #print(jdata)
-    html = json2html.convert(json = jdata)
-    fwhtml.write(html)
-    fwhtml.close()
-
-    # create url savefile
-    # print('url_d: ', url_d)
-    jsonurlout = "%s/%s_url.json" % (outdir, out_filename)
-    fwjson = open(jsonurlout, 'w')
-    jdata = json.dumps(url_d)
-    fwjson.write(jdata)
-    fwjson.close()
-
-    txtout = "%s/%s_url.txt" % (outdir, out_filename)
-    fwtxt = open(txtout, 'w')
-    for k in url_d.keys():
-        ddata = url_d[k]
-        metatxt = '%s:%s\n' % (ddata['url'], ddata['filename'])
-        fwtxt.write(metatxt)
-    fwtxt.close()
+    # creating the analysis dictionary for reporting
+    analysis_dict = prepare_analysis_dict(ana_q)
+
+    # lets go through the different reporting types
+    if args.report_txt:
+        if create_txt_report(analysis_dict, outdir, out_filename):
+            logger.info('Successfully created txt report')
+    if args.report_json:
+        if create_json_report(analysis_dict, outdir, out_filename):
+            logger.info('Successfully created json report')
+    if args.report_html:
+        if create_html_report(analysis_dict, outdir, out_filename):
+            logger.info('Successfully created html report')
+    if args.report_url_txt:
+        if create_url_txt(url_d, outdir, out_filename):
+            logger.info('Successfully created txt url report')
+    if args.report_url_json:
+        if create_url_json(url_d, outdir, out_filename):
+            logger.info('Successfully created json url report')

     return 42
@@ -504,8 +415,14 @@ def main():
help="specify domain or tld to scrape for pdf-files", default=None) help="specify domain or tld to scrape for pdf-files", default=None)
parser.add_argument('-sn', '--search-number', action='store', dest='search_stop', required=False, parser.add_argument('-sn', '--search-number', action='store', dest='search_stop', required=False,
help="specify how many files are searched", default=10, type=int) help="specify how many files are searched", default=10, type=int)
parser.add_argument('-z', '--disable-cert-check', action='store_false', dest='cert_check', required=False, parser.add_argument('-z', '--disable-cert-check', action='store_false', dest='cert_check', required=False,help="if the target domain(s) run with old or bad certificates", default=True)
help="if the target domain(s) run with old or bad certificates", default=True)
parser.add_argument('-ghr', '--gen-html-report', action='store', dest='gen_html_report', required=False,help="If you want to generate the html report after editing the json outfile (parameter: pdfgrab_analysis.json)")
parser.add_argument('-rtd', '--report-text-disable', action='store_false', dest='report_txt', required=False,help="Disable txt report",default=True)
parser.add_argument('-rjd', '--report-json-disable', action='store_false', dest='report_json', required=False,help="Disable json report",default=True)
parser.add_argument('-rhd', '--report-html-disable', action='store_false', dest='report_html', required=False,help="Disable html report",default=True)
parser.add_argument('-rutd', '--report-url-text-disable', action='store_false', dest='report_url_txt', required=False,help="Disable url txt report",default=True)
parser.add_argument('-rujd', '--report-url-json-disable', action='store_false', dest='report_url_json', required=False,help="Disable url json report",default=True)
if len(sys.argv)<2: if len(sys.argv)<2:
parser.print_help(sys.stderr) parser.print_help(sys.stderr)
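
With these flags in place, the new -ghr option lets you regenerate just the html report after hand-editing the json outfile. A hypothetical round trip, assuming the default output directory pdfgrab/ seen in the README example and the pdfgrab_analysis.json name mentioned in the help text above:
```
# ./pdfgrab.py -s kernel.org -rhd
# ... edit pdfgrab/pdfgrab_analysis.json as needed ...
# ./pdfgrab.py -ghr pdfgrab/pdfgrab_analysis.json
```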

supply/pdf.png (new binary file, 24 KiB; content not shown)

supply/pdf_base64.png (new file, 1 line; diff suppressed because the line is too long)