bugfix pre-release 0.4.8

Author: c0decave
Date:   2019-11-06 12:49:54 +01:00
parent 4f63e62690
commit a89ac93c3d
5 changed files with 99 additions and 30 deletions

View File

@@ -1,6 +1,6 @@
 # pdfgrab
-* Version 0.4.7
+* Version 0.4.8-Pre
 ## What is it?
@@ -9,6 +9,14 @@ Basically it analyses PDF files for Metadata. You can direct it to a file or dir
 You can show it the url of a pdf or use the integrated googlesearch (thanx to mario vilas class)
 to search for pdfs at target site, download and analyse them.
+## What is new in the 0.4.8 bug fix pre-release?
+* catching the google error at too many requests
+* catching the dns resolve urlopen error in googlelib
+* fixing an annoying bug with pdfs behind urls like http://host/pdf/
+* fixing the zero size pdf error (online linked pdfs which are not accessible)
+* added some logging
 ## What is new in 0.4.7 release?
 * Added support for html output file, this will be placed in the outdir path and is more clear then a text or json file

View File

@@ -1,6 +1,15 @@
 Changelog
 =========
+Version 4.8 Bugfix-PreRelease
+-----------------------------
+* catching google too many requests
+* catching urlopen dns not resolvable error
+* fixing nasty bug in store_pdf/find_name
+* fixing zero size pdf error
+* extra logging
 Version 4.7
 -----------

View File

@@ -22,9 +22,12 @@ def search_pdf(search, args):
             urls.append(url)
     except urllib.error.HTTPError as e:
-        print('Error: %s' % e)
-        return -1
-    return urls
+        #print('Error: %s' % e)
+        return False,e
+    except urllib.error.URLError as e:
+        return False,e
+    return True,urls
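With this change search_pdf no longer returns -1 on failure but a (status, value) pair, so callers have to unpack it. A minimal caller sketch, assuming search_pdf from this file is importable; the 429 handling mirrors the seek_and_analyse hunk further down, everything else is illustrative:

    import urllib.error

    def handle_search(search, args):
        # search_pdf now returns (True, urls) on success, (False, exception) on failure
        ok, value = search_pdf(search, args)
        if not ok:
            if isinstance(value, urllib.error.HTTPError) and value.code == 429:
                print('too many requests, back off or switch the source address')
            else:
                print('search failed: {0}'.format(value))
            return -1
        for url in value:
            print('found pdf: {0}'.format(url))
        return 0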

View File

@@ -2,6 +2,14 @@ import os
 import sys
 from Crypto.Hash import SHA256
+def check_file_size(filename):
+    ''' simply check if byte size is bigger than 0 bytes
+    '''
+    fstat = os.stat(filename)
+    if fstat.st_size == 0:
+        return False
+    return True
 def make_directory(outdir):
     ''' naive mkdir function '''
     try:
@@ -27,11 +35,21 @@ def find_name(pdf):
     i would not be surprised this naive approach can lead to fuckups
     '''
+    name = ''
     # find the name of the file
-    name = pdf.split("/")
-    a = len(name)
-    name = name[a - 1]
-    # print(name)
+    #
+    name_list = pdf.split("/")
+    len_list = len(name_list)
+    # ugly magic ;-)
+    # files can also sit behind urls like:
+    # http://host/pdf/
+    # splitting the url and always taking the last item after the slash
+    # then yields an empty name, so in that case we go one field back
+    # in the list and use that as the name
+    if name_list[len_list - 1] == '':
+        name = name_list[len_list - 2]
+    else:
+        name = name_list[len_list - 1]
     return name
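For illustration, the trailing-slash logic in isolation; find_name above is the real function, this standalone sketch and the example URLs are only for demonstration:

    def find_name_demo(pdf):
        # a url like http://host/pdf/ splits into [..., 'pdf', ''],
        # so fall back to the element before the empty tail
        parts = pdf.split("/")
        return parts[-2] if parts[-1] == '' else parts[-1]

    print(find_name_demo('http://host/docs/report.pdf'))  # report.pdf
    print(find_name_demo('http://host/pdf/'))             # pdf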

View File

@@ -27,9 +27,9 @@ from IPython import embed
 # some variables in regard of the tool itself
 name = 'pdfgrab'
-version = '0.4.7'
+version = '0.4.8-Pre'
 author = 'dash'
-date = '2019'
+date = 'November 2019'
 # queues for processing
 # this queue holds the URL locations of files to download
@@ -114,7 +114,7 @@ def get_xmp_meta_data(filename, filehandle):
         xmp_meta = fh.getXmpMetadata()
     except xml.parsers.expat.ExpatError as e:
-        print('Error: %s' % e)
+        logger.warning('get_xmp_meta_data error {0}'.format(e))
         err_dict = {'error': str(e)}
         return -1
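The print()-to-logger conversions in this file assume a module-level `logger` object; its configuration is not part of this diff. A minimal sketch of what such a setup could look like (the logger name and format here are assumptions, not pdfgrab's actual setup):

    import logging

    # assumed setup; pdfgrab's real configuration may differ
    logging.basicConfig(format='%(asctime)s %(levelname)s %(message)s',
                        level=logging.INFO)
    logger = logging.getLogger('pdfgrab')  # hypothetical name

    # same call pattern as the converted lines
    logger.warning('get_xmp_meta_data error {0}'.format('sample error'))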
@@ -122,8 +122,15 @@ def get_xmp_meta_data(filename, filehandle):
         process_queue_data(filename, err_dict, 'doc_xmp_info')
     if xmp_meta != None:
-        print('xmp_meta: {0} {1} {2} {3} {4} {5}'.format(xmp_meta.pdf_producer,xmp_meta.pdf_pdfversion,xmp_meta.dc_contributor,xmp_meta.dc_creator,xmp_meta.dc_date,xmp_meta.dc_subject))
-        xmp_dict = {}
+        try:
+            print('xmp_meta: {0} {1} {2} {3} {4} {5}'.format(xmp_meta.pdf_producer,xmp_meta.pdf_pdfversion,xmp_meta.dc_contributor,xmp_meta.dc_creator,xmp_meta.dc_date,xmp_meta.dc_subject))
+            #print('xmp_meta cache: {0}'.format(xmp_meta.cache))
+            #print('xmp_meta custom properties: {0}'.format(xmp_meta.custom_properties))
+            #embed()
+        except AttributeError as e:
+            logger.warning('xmp_meta print {0}'.format(e))
+            return False
     return xmp_dict
@@ -152,12 +159,12 @@ def get_DocInfo(filename, filehandle):
         extract = fh.documentInfo
     except pdf.utils.PdfReadError as e:
-        print('Error: %s' % e)
+        logger.warning('get_doc_info {0}'.format(e))
         err_dict = {'error': str(e)}
         return -1
     except PyPDF2.utils.PdfReadError as e:
-        print('Error: %s' % e)
+        logger.warning('get_doc_info {0}'.format(e))
         err_dict = {'error': str(e)}
         return -1
@@ -185,7 +192,7 @@ def get_DocInfo(filename, filehandle):
         print('-' * 80)
     except PyPDF2.utils.PdfReadError as e:
-        print('Error: %s' % e)
+        logger.warning('get_doc_info {0}'.format(e))
         err_dict = {'error': str(e)}
         process_queue_data(filename, err_dict, 'doc_info')
         return -1
@@ -204,8 +211,7 @@ def decrypt_empty_pdf(filename):
         fr.decrypt('')
     except NotImplementedError as e:
-        # print('Error: %s' % (e))
-        print('Error: File: %s encrypted. %s' % (filename, str(e)))
+        logger.warning('decrypt_empty_pdf {0}{1}'.format(filename,e))
         return -1
     return fr
@@ -214,11 +220,12 @@ def check_encryption(filename):
     ''' basic function to check if file is encrypted
     '''
-    # print(filename)
+    print(filename)
     try:
         fr = pdf.PdfFileReader(open(filename, "rb"))
+        print(fr)
     except pdf.utils.PdfReadError as e:
-        print('Error: %s' % e)
+        logger.warning('check encryption {0}'.format(e))
         return -1
     if fr.getIsEncrypted() == True:
@@ -250,15 +257,15 @@ def download_pdf(url, args, header_data):
         status_code = req.status_code
     except requests.exceptions.SSLError as e:
-        print('Error: %s' % e)
+        logger.warning('download pdf {0}{1}'.format(url,e))
         return -1
     except:
-        print('Error: Probably something wrong with remote server')
+        logger.warning('download pdf, something wrong with remote server? {0}'.format(url))
         return -1
     if status_code == 403:
-        print('%s http/403 Forbidden' % (url))
+        logger.warning('download pdf, 403 Forbidden {0}'.format(url))
         return -1
     # print(len(data))
@@ -269,8 +276,11 @@ def store_pdf(url, data, outdir):
     ''' storing the downloaded pdf data
     '''
-    logger.info('Store pdf')
+    logger.info('Store pdf {0}'.format(url))
     name = find_name(url)
+    #logger.warning(url)
+    #logger.warning(name)
+    #logger.warning(outdir)
     # only allow stored file a name with 50 chars
     if len(name) > 50:
@@ -282,13 +292,19 @@ def store_pdf(url, data, outdir):
     try:
         f = open(save, "wb")
     except OSError as e:
-        print('Error: %s' % (e))
+        logger.warning('store_pdf {0}'.format(e))
         return -1
     ret = f.write(data)
     logger.info('Written {0} bytes for file: {1}'.format(ret,save))
     f.close()
+    if ret == 0:
+        logger.warning('Written {0} bytes for file: {1}'.format(ret,save))
+        return save
+        #return -1
     # return the savepath
     return save
@@ -296,8 +312,13 @@ def store_pdf(url, data, outdir):
 def _parse_pdf(filename):
     ''' the real parsing function '''
-    ret = check_encryption(filename)
-    return ret
+    logger.warning('{0}'.format(filename))
+    if check_file_size(filename):
+        ret = check_encryption(filename)
+        return ret
+    else:
+        logger.warning('Filesize is 0 bytes at file: {0}'.format(filename))
+        return False
 def grab_url(url, args, outdir):
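A self-contained demonstration of the new zero-size guard; check_file_size is the helper added in this commit, while the temp-file handling exists only for the example:

    import os
    import tempfile

    def check_file_size(filename):
        # same check as the helper added above: reject 0-byte files
        return os.stat(filename).st_size > 0

    fd, path = tempfile.mkstemp(suffix='.pdf')
    os.close(fd)                   # the file exists but holds 0 bytes
    if not check_file_size(path):
        # would be skipped, just like in _parse_pdf
        print('Filesize is 0 bytes at file: {0}'.format(path))
    os.remove(path)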
@@ -318,8 +339,16 @@ def seek_and_analyse(search, args, outdir):
     them together
     '''
     # use the search function of googlesearch to get the results
-    urls=search_pdf(search, args)
-    for item in urls:
+    code, values=search_pdf(search, args)
+    if not code:
+        if values.code == 429:
+            logger.warning('[-] Too many requests, time to change ip address or use proxychains')
+        else:
+            logger.warning('Google returned error {0}'.format(values))
+        return -1
+    for item in values:
         filename = find_name(item)
         process_queue_data(filename, item, 'url')
@@ -335,6 +364,7 @@ def seek_and_analyse(search, args, outdir):
         url = item['url']
         grab_url(url, args, outdir)
+    return True
 def run(args):
@@ -369,7 +399,8 @@ def run(args):
     elif args.search:
         search = args.search
         logger.info('Seek and analyse {0}'.format(search))
-        seek_and_analyse(search, args, outdir)
+        if not seek_and_analyse(search, args, outdir):
+            return -1
     elif args.files_dir:
         directory = args.files_dir