release version 0.4.9

2019-11-07 12:51:30 +01:00
parent a89ac93c3d
commit 132750867f
10 changed files with 477 additions and 161 deletions
--- a/libs/libgoogle.py
+++ b/libs/libgoogle.py
@@ -5,19 +5,44 @@ from libs.libhelper import *
 def get_random_agent():
    return (gs.get_random_user_agent())

-def search_pdf(search, args):
+def hits_google(search, args):
+    ''' ask google (googlesearch from mario vilas) for the number of
+        pdf hits on the given domains
+        params: search  - comma separated string of domains
+                args    - argparse args namespace, not used in here
+        returns: tuple (True, hits) or (False, exception) if the lookup failed
+    '''
+    # the query itself is fixed, the domains are passed on separately
+    pdf_query = 'filetype:pdf'
+    domain_list = search.split(',')
+
+    try:
+        agent = gs.get_random_user_agent()
+        found = gs.hits(pdf_query, domains=domain_list, user_agent=agent)
+
+    # HTTPError, URLError and IndexError all end up the same way:
+    # the exception is handed back to the caller instead of being raised
+    except (urllib.error.HTTPError, urllib.error.URLError, IndexError) as err:
+        return False, err
+
+    return True, found
+
+
+def search_google(search, args):
    ''' the function where googlesearch from mario vilas
 		is called
 	'''

+    s = search.split(',')
    search_stop = args.search_stop

-    query = '%s filetype:pdf' % search
+    query = 'filetype:pdf'
+    #query = 'site:%s filetype:pdf' % search
    # print(query)
    urls = []

    try:
-        for url in gs.search(query, num=20, stop=search_stop, user_agent=gs.get_random_user_agent()):
+        for url in gs.search(query, num=20, domains=s,stop=search_stop, user_agent=gs.get_random_user_agent()):
            #print(url)
            urls.append(url)

--- a/libs/liblog.py
+++ b/libs/liblog.py
@@ -8,10 +8,12 @@ file_handler = logging.FileHandler('pdfgrab.log')
 console_handler = logging.StreamHandler()
 console_handler.setLevel(logging.WARNING)

-formatter = logging.Formatter('%(asctime)s:%(name)s:%(levelname)s:%(message)s')
+file_formatter = logging.Formatter('%(asctime)s:%(name)s:%(levelname)s:%(message)s')
+console_formatter = logging.Formatter('%(levelname)s:%(message)s')

-file_handler.setFormatter(formatter)
-console_handler.setFormatter(formatter)
+
+file_handler.setFormatter(file_formatter)
+console_handler.setFormatter(console_formatter)

 logger.addHandler(file_handler)
 logger.addHandler(console_handler)
--- a/libs/libreport.py
+++ b/libs/libreport.py
@@ -0,0 +1,173 @@
+import os
+import sys
+import json
+from json2html import * 
+from libs.pdf_png import get_png_base64
+
+def prepare_analysis_dict(ana_queue):
+    ''' drain the analysis queue into one dictionary
+        params: ana_queue - queue whose items are dictionaries
+        returns: dict - all queued dictionaries merged, later items
+                        overwrite earlier ones on equal keys
+    '''
+    merged = {}
+
+    # pull entries until the queue reports empty
+    while not ana_queue.empty():
+        merged.update(ana_queue.get())
+
+    # queue is drained, hand back the merged result
+    return merged
+
+def create_txt_report(analysis_dict, outdir, out_filename):
+    ''' create a txt report in the output directory
+        params: analysis_dict - dict of findings, every value has to carry
+                                the keys 'filename' and 'data' (dict of metadata)
+                outdir        - directory the report is written to
+                out_filename  - name of the report without the .txt extension
+        returns: True after the report has been written
+    '''
+
+    # draw separator lines
+    sep = '-' * 80 + '\n'
+
+    # create output filepath
+    txtout = "%s/%s.txt" % (outdir, out_filename)
+
+    # the with block closes the file even when an entry is malformed
+    # and a KeyError is raised half way through the report
+    with open(txtout, 'w') as fwtxt:
+
+        for entry in analysis_dict.values():
+            # look up both parts first, nothing is written for a broken entry
+            fname = 'File: %s\n' % (entry['filename'])
+            ddata = entry['data']
+
+            # write separator
+            fwtxt.write(sep)
+
+            # write the filename
+            fwtxt.write(fname)
+
+            # write the metadata, one key:value pair per line
+            for kdata, value in ddata.items():
+                fwtxt.write('%s:%s\n' % (kdata, value))
+
+            # write separator
+            fwtxt.write(sep)
+
+    # the file is closed at this point
+    return True
+
+def create_json_report(analysis_dict, outdir, out_filename):
+    ''' create a jsonfile report in the output directory
+        params: analysis_dict - dict of findings, has to be json serialisable
+                outdir        - directory the report is written to
+                out_filename  - name of the report without the .json extension
+        returns: True after the report has been written
+    '''
+
+    # build json output name
+    jsonout = "%s/%s.json" % (outdir, out_filename)
+
+    # convert dictionary to json data before the file is touched, so a
+    # failing conversion does not leave an empty report behind
+    jdata = json.dumps(analysis_dict)
+
+    # write json data, the with block closes the file in any case
+    with open(jsonout, 'w') as fwjson:
+        fwjson.write(jdata)
+
+    return True
+
+def create_html_report(analysis_dict, outdir, out_filename):
+    ''' create a html report using json2html in the output directory,
+        each finding gets its own table
+        params: analysis_dict - dict of findings, values have to be json serialisable
+                outdir        - directory the report is written to
+                out_filename  - name of the report without the .html extension
+        returns: True after the report has been written
+    '''
+
+    # build up path for html output file
+    htmlout = "%s/%s.html" % (outdir, out_filename)
+
+    # some html stuff, the logo is read from a file and embedded into
+    # the page as data uri
+    pdfpng=get_png_base64('supply/pdf_base64.png')
+    html_style ='<style>.center { display: block; margin-left: auto;margin-right: auto;} table {border-collapse: collapse;} th, td { border: 1px solid black;text-align: left; }</style>\n'
+    html_head = '<html><head><title>pdfgrab - {0} item/s</title>{1}</head>\n'.format(len(analysis_dict),html_style)
+    html_pdf_png = '<p class="center"><img class="center" src="data:image/jpeg;base64,{0}"><br><center>pdfgrab - grab and analyse pdf files</center><br></p>'.format(pdfpng)
+    html_body = '<body>{0}\n'.format(html_pdf_png)
+    html_end = '\n<br><br><p align="center"><a href="https://github.com/c0decave/pdfgrab">pdfgrab</a> by <a href="https://twitter.com/User_to_Root">dash</a></p></body></html>\n'
+
+    # some attributes
+    attr = 'id="meta-data" class="table table-bordered table-hover", border=1, cellpadding=3 summary="Metadata"'
+
+    # convert every finding to json and let json2html build one table for it,
+    # the tables are collected in a list and joined once at the end
+    tables = []
+    for trans in analysis_dict.values():
+        jdata = json.dumps(trans)
+        tables.append(json2html.convert(json = jdata, table_attributes=attr) + "\n")
+
+    html_out = ''.join(tables)
+
+    # everything is prepared, now write the file; opening it this late means
+    # a failing conversion does not leave an empty report behind and the
+    # with block closes the file in any case
+    with open(htmlout, 'w') as fwhtml:
+        fwhtml.write(html_head)
+        fwhtml.write(html_body)
+        fwhtml.write(html_out)
+        fwhtml.write(html_end)
+
+    # report success like the other create_* functions do
+    return True
+    
+def create_url_json(url_d, outdir, out_filename):
+    ''' create a json url file in output directory
+        params: url_d         - url dictionary, has to be json serialisable
+                outdir        - directory the file is written to
+                out_filename  - base name, "_url.json" is appended
+        returns: True after the file has been written
+    '''
+
+    # create url savefile
+    jsonurlout = "%s/%s_url.json" % (outdir, out_filename)
+
+    # convert url dictionary to json before the file is touched, so a
+    # failing conversion does not leave an empty file behind
+    jdata = json.dumps(url_d)
+
+    # write json data, the with block closes the file in any case
+    with open(jsonurlout, 'w') as fwjson:
+        fwjson.write(jdata)
+
+    return True
+
+def create_url_txt(url_d, outdir, out_filename):
+    ''' create a txt url file in output directory
+        params: url_d         - url dictionary, every value has to carry
+                                the keys 'url' and 'filename'
+                outdir        - directory the file is written to
+                out_filename  - base name, "_url.txt" is appended
+        returns: True after the file has been written
+    '''
+    # build up txt out path
+    txtout = "%s/%s_url.txt" % (outdir, out_filename)
+
+    # the with block closes the file even when an entry is malformed
+    with open(txtout, 'w') as fwtxt:
+
+        # only the entries are needed, the keys of the url dictionary are not
+        for ddata in url_d.values():
+
+            # create meta data for saving
+            metatxt = '%s:%s\n' % (ddata['url'], ddata['filename'])
+
+            # write metadata to file
+            fwtxt.write(metatxt)
+
+    # the file is closed at this point
+    return True
--- a/libs/librequest.py
+++ b/libs/librequest.py
@@ -0,0 +1,162 @@
+import os
+import sys
+import json
+import socket
+import requests
+
+from libs.liblog import logger
+from libs.libhelper import *
+from libs.libgoogle import get_random_agent
+
+def store_file(url, data, outdir):
+    ''' storing the downloaded data to a file
+        params: url     - is used to create the filename
+                          (the name is cut down to 50 chars)
+                data    - the data of the file
+                outdir  - to store in which directory
+        returns: dict { "code":<code>, "data":<savepath>,"error":<error>} - the status code, the savepath, the errorcode
+                 code is False and error the OSError if opening or writing failed
+    '''
+
+    logger.info('Store file {0}'.format(url))
+    name = find_name(url)
+
+    # only allow stored file a name with 50 chars, slicing with [:50]
+    # keeps exactly 50 chars and leaves shorter names untouched
+    name = name[:50]
+
+    # build up the save path
+    save = "%s/%s" % (outdir, name)
+
+    # open and write inside one try block, the with statement closes the
+    # file descriptor even if writing fails (disk full and the like)
+    try:
+        with open(save, "wb") as f:
+            # write the data and keep the number of written bytes
+            ret = f.write(data)
+
+    except OSError as e:
+        logger.warning('store_file {0}'.format(e))
+        # return ret_dict
+        return {"code":False,"data":save,"error":e}
+
+    # check if bytes are zero
+    if ret == 0:
+        logger.warning('Written {0} bytes for file: {1}'.format(ret,save))
+
+    else:
+        # log to info that bytes and file has been written
+        logger.info('Written {0} bytes for file: {1}'.format(ret,save))
+
+    # return ret_dict
+    return {"code":True,"data":save,"error":False}
+
+
+def download_file(url, args, header_data):
+    ''' downloading the file for later analysis
+        params: url         - the url
+                args        - argparse args namespace, args.cert_check is read
+                header_data - pre-defined header data
+        returns: dict { "code":<code>, "data":<response>,"error":<error>}
+                 success: code True, data the response object, error False
+                 failure: code False, data False, error the exception
+                          (or True for an unexpected exception)
+    '''
+
+    # check the remote tls certificate or not?
+    cert_check = args.cert_check
+
+    # run our try catch routine
+    try:
+        # request the url and save the response in req
+        # give header data and set verify as delivered by args.cert_check
+        req = requests.get(url, headers=header_data, verify=cert_check)
+
+    # note for all handlers: when requests.get raises, req has never been
+    # assigned, so there is no response to hand back and data is False
+    except requests.exceptions.SSLError as e:
+        logger.warning('download file {0}{1}'.format(url,e))
+
+        # return retdict
+        return {"code":False,"data":False,"error":e}
+
+    except requests.exceptions.InvalidSchema as e:
+        logger.warning('download file {0}{1}'.format(url,e))
+
+        # return retdict
+        return {"code":False,"data":False,"error":e}
+
+    except socket.gaierror as e:
+        logger.warning('download file, host not known {0} {1}'.format(url,e))
+        return {"code":False,"data":False,"error":e}
+
+    # Exception instead of a bare except, so KeyboardInterrupt and
+    # SystemExit are not swallowed here
+    except Exception:
+        logger.warning('download file, something wrong with remote server? {0}'.format(url))
+        # return retdict
+        return {"code":False,"data":False,"error":True}
+
+    # return retdict
+    return {"code":True,"data":req,"error":False}
+
+def grab_run(url, args, outdir):
+    ''' function keeping all the steps for the user call of grabbing
+        just one file: download, evaluate the response, store the file
+        params: url     - the url to grab
+                args    - argparse args namespace, handed to download_file
+                outdir  - directory the file is stored in
+        returns: dict {"code":True,"data":<savepath>,"error":False} if every step
+                 worked, otherwise {"code":False,"data":False,"error":True}
+    '''
+    # every failing step ends with the same answer
+    failed = {"code":False,"data":False,"error":True}
+
+    # step 1: download with a random user agent
+    header_data = {'User-Agent': get_random_agent()}
+    rd_download = download_file(url, args, header_data)
+    if not rd_download['code']:
+        return failed
+
+    # step 2: check what the server answered
+    rd_evaluate = evaluate_response(rd_download)
+    if not rd_evaluate['code']:
+        return failed
+
+    # step 3: store the content of the response
+    rd_store = store_file(url, rd_evaluate['data'].content, outdir)
+    savepath = rd_store['data']
+    if not rd_store['code']:
+        return failed
+
+    # all steps worked, the savepath is the result
+    return {"code":True,"data":savepath,"error":False}
+
+def evalute_content(ret_dict):
+    ''' placeholder without any logic yet, returns None '''
+
+def evaluate_response(ret_dict):
+    ''' look at the response that download_file handed back and decide
+        whether there is data worth processing
+        params: ret_dict - dict as returned by download_file, ret_dict['data']
+                           is the response object of the request
+        returns: dict { "code":<code>, "data":<response>,"error":<error>}
+                 code True / error False for status 200, for every other
+                 status code False / error True, data is always the response
+    '''
+    # extract data from ret_dict
+    response = ret_dict['data']
+
+    # what the server told us
+    url = response.url
+    status = response.status_code
+    reason = response.reason
+
+    # only a plain 200 counts as success, it is logged as info,
+    # everything else as warning
+    success = status == 200
+    log = logger.info if success else logger.warning
+    log('download file, {0} {1} {2}'.format(url,reason,status))
+
+    # same dict layout for both outcomes
+    return {"code":success,"data":response,"error":not success}
--- a/libs/pdf_png.py
+++ b/libs/pdf_png.py
@@ -0,0 +1,5 @@
+
+def get_png_base64(filename):
+    ''' return the content of filename as text, the file is closed afterwards '''
+    with open(filename,'r') as fr:
+        return fr.read()