Files
pdfgrab/libs/librequest.py
2019-11-07 12:51:30 +01:00

163 lines
4.9 KiB
Python

import os
import sys
import json
import socket
import requests
from libs.liblog import logger
from libs.libhelper import *
from libs.libgoogle import get_random_agent
def store_file(url, data, outdir):
    ''' storing the downloaded data to a file

        params: url - is used to create the filename
                data - the data of the file, written in binary mode
                outdir - to store in which directory
        returns: dict { "code":<code>, "data":<savepath>, "error":<error> }
                 code  - True if the file was opened and written, else False
                 data  - the path the file was (or should have been) saved to
                 error - False on success, otherwise the caught OSError
    '''
    logger.info('Store file {0}'.format(url))
    # find_name is not defined in this module; presumably provided by the
    # star import from libs.libhelper - TODO confirm
    name = find_name(url)

    # only allow stored file a name with 50 chars
    # (the slice keeps exactly 50 so it agrees with the length check)
    if len(name) > 50:
        name = name[:50]

    # build up the save path
    save = "%s/%s" % (outdir, name)

    # the context manager closes the file descriptor even when the write
    # fails; open and write errors are both reported through the ret_dict
    try:
        with open(save, "wb") as f:
            # write the data and keep the number of written bytes
            ret = f.write(data)
    except OSError as e:
        logger.warning('store_file {0}'.format(e))
        return {"code":False,"data":save,"error":e}

    # an empty file is suspicious, so it is logged as a warning and not as info
    if ret == 0:
        logger.warning('Written {0} bytes for file: {1}'.format(ret,save))
    else:
        logger.info('Written {0} bytes for file: {1}'.format(ret,save))

    return {"code":True,"data":save,"error":False}
def download_file(url, args, header_data):
    ''' downloading the file for later analysis

        params: url - the url
                args - argparse args namespace, only args.cert_check is read
                header_data - pre-defined header data
        returns: dict { "code":<code>, "data":<response>, "error":<error> }
                 code  - True if the request returned a response, else False
                 data  - the response object on success, False on any failure
                 error - False on success, the caught exception for the
                         explicitly handled error types, True for any other
                         failure
    '''
    # check the remote tls certificate or not?
    cert_check = args.cert_check

    try:
        # request the url and save the response in req
        # give header data and set verify as delivered by args.cert_check
        req = requests.get(url, headers=header_data, verify=cert_check)

    except requests.exceptions.SSLError as e:
        logger.warning('download file {0}{1}'.format(url,e))
        # requests.get raised, so there is no response object to hand back
        return {"code":False,"data":False,"error":e}

    except requests.exceptions.InvalidSchema as e:
        logger.warning('download file {0}{1}'.format(url,e))
        return {"code":False,"data":False,"error":e}

    except socket.gaierror as e:
        logger.warning('download file, host not known {0} {1}'.format(url,e))
        return {"code":False,"data":False,"error":e}

    except Exception:
        # catch-all for every other failure of the request; Exception is used
        # instead of a bare except so that KeyboardInterrupt and SystemExit
        # still reach the caller
        logger.warning('download file, something wrong with remote server? {0}'.format(url))
        return {"code":False,"data":False,"error":True}

    return {"code":True,"data":req,"error":False}
def grab_run(url, args, outdir):
    ''' run the complete grab chain for a single url:
        download it, evaluate the response and store the content

        params: url - the url to grab
                args - argparse args namespace, handed on to download_file
                outdir - directory handed on to store_file
        returns: dict { "code":<code>, "data":<savepath>, "error":<error> }
                 code True with the savepath if all three steps succeeded,
                 otherwise code False, data False and error True
    '''
    failed = {"code":False,"data":False,"error":True}

    # step 1: download with a randomly chosen user agent
    headers = {'User-Agent': get_random_agent()}
    downloaded = download_file(url, args, headers)
    if not downloaded['code']:
        return failed

    # step 2: evaluate what the server answered
    evaluated = evaluate_response(downloaded)
    if not evaluated['code']:
        return failed

    # step 3: store the content of the evaluated response
    stored = store_file(url, evaluated['data'].content, outdir)
    stored_ok = stored['code']
    savepath = stored['data']
    if not stored_ok:
        return failed

    return {"code":True,"data":savepath,"error":False}
def evalute_content(ret_dict):
    ''' placeholder, not implemented yet

        params: ret_dict - accepted but not used
        returns: None
    '''
def evaluate_response(ret_dict):
    ''' this method comes usually after download_file,
        it will evaluate what has happened and if we even have some data to
        process or not

        params: ret_dict - dict whose "data" entry is the response object of
                           the conducted request, or False/None if there is
                           no response
        returns: dict { "code":<code>, "data":<response>, "error":<error> }
                 code  - True only if the status code of the response is 200
                 data  - the "data" entry of ret_dict, handed back unchanged
                 error - False if code is True, otherwise True
    '''
    # extract data from ret_dict
    req = ret_dict['data']

    # a failed download carries False instead of a response object, so there
    # is nothing to evaluate. the identity checks are deliberate: a requests
    # response is itself falsy for error status codes, and those still have to
    # reach the logging branch below
    if req is None or req is False:
        return {"code":False,"data":req,"error":True}

    url = req.url
    status = req.status_code
    reason = req.reason

    # ahh everything is fine
    if status == 200:
        logger.info('download file, {0} {1} {2}'.format(url,reason,status))
        return {"code":True,"data":req,"error":False}

    # nah something is not like it should be
    logger.warning('download file, {0} {1} {2}'.format(url,reason,status))
    return {"code":False,"data":req,"error":True}