bugfix pre-release 0.4.8
Readme.md
@@ -1,6 +1,6 @@
 # pdfgrab
 
-* Version 0.4.7
+* Version 0.4.8-Pre
 
 ## What is it?
 
@@ -9,6 +9,14 @@ Basically it analyses PDF files for Metadata. You can direct it to a file or dir
 You can show it the url of a pdf or use the integrated googlesearch (thanx to mario vilas class)
 to search for pdfs at target site, download and analyse them.
 
+## What is new in 0.4.8 bug fix pre-release?
+
+* catching google error at too many requests
+* catching dns resolve urlopen error at googlelib
+* fixing annoying bug in regard of pdfs behind urls like http://host/pdf/
+* fixing zero size pdf error (online linked pdfs which are not accessible)
+* added some logging
+
 ## What is new in 0.4.7 release?
 
 * Added support for html output file, this will be placed in the outdir path and is clearer than a text or json file
Changelog
@@ -1,6 +1,15 @@
 Changelog
 =========
 
+Version 4.8 Bugfix-PreRelease
+-----------------------------
+
+* catching google too many requests
+* catching urlopen dns not resolvable error
+* fixing nasty bug in store_pdf/find_name
+* fixing zero size pdf error
+* extra logging
+
 Version 4.7
 -----------
 
@@ -22,9 +22,12 @@ def search_pdf(search, args):
             urls.append(url)
 
     except urllib.error.HTTPError as e:
-        print('Error: %s' % e)
-        return -1
+        #print('Error: %s' % e)
+        return False, e
+
+    except urllib.error.URLError as e:
+        return False, e
 
 
-    return urls
+    return True, urls
 
@@ -2,6 +2,14 @@ import os
 import sys
 from Crypto.Hash import SHA256
 
+def check_file_size(filename):
+    ''' simply check if byte size is bigger than 0 bytes
+    '''
+    fstat = os.stat(filename)
+    if fstat.st_size == 0:
+        return False
+    return True
+
 def make_directory(outdir):
     ''' naive mkdir function '''
     try:
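The new check_file_size helper guards against zero-byte downloads before parsing. A minimal standalone sketch of how it is meant to be used, mirroring the _parse_pdf change further down (the path is a placeholder, not part of pdfgrab):

```python
import os

def check_file_size(filename):
    ''' simply check if byte size is bigger than 0 bytes '''
    return os.stat(filename).st_size > 0

# hypothetical usage: skip zero-size files (online linked pdfs which are not accessible)
path = 'downloads/sample.pdf'  # placeholder path, not shipped with pdfgrab
if os.path.exists(path) and check_file_size(path):
    print('parsing', path)
else:
    print('skipping empty or missing file', path)
```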
@@ -27,11 +35,21 @@ def find_name(pdf):
     i would not be surprised this naive approach can lead to fuckups
     '''
 
+    name = ''
     # find the name of the file
-    name = pdf.split("/")
-    a = len(name)
-    name = name[a - 1]
-    # print(name)
+    #
+    name_list = pdf.split("/")
+    len_list = len(name_list)
+    # ugly magic ;-)
+    # what happens is that files can also be behind urls like:
+    # http://host/pdf/
+    # so splitting up the url and always taking the last item after the slash
+    # can in that case result in an empty name, so we go one more field back
+    # in the list and use this as the name
+    if name_list[len_list - 1] == '':
+        name = name_list[len_list - 2]
+    else:
+        name = name_list[len_list - 1]
 
     return name
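To see what the "ugly magic" fixes, here is a stripped-down, standalone sketch of the new find_name logic with the two URL shapes the comments mention (it mirrors, not replaces, the code above):

```python
def find_name(pdf):
    # a URL ending in a slash yields an empty last element after split("/"),
    # so the previous element is used instead
    name_list = pdf.split("/")
    len_list = len(name_list)
    if name_list[len_list - 1] == '':
        return name_list[len_list - 2]
    return name_list[len_list - 1]

print(find_name('http://host/pdf/report.pdf'))  # -> 'report.pdf'
print(find_name('http://host/pdf/'))            # -> 'pdf' (was '' before the fix)
```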
pdfgrab.py
@@ -27,9 +27,9 @@ from IPython import embed
 
 # some variables in regard of the tool itself
 name = 'pdfgrab'
-version = '0.4.7'
+version = '0.4.8-Pre'
 author = 'dash'
-date = '2019'
+date = 'November 2019'
 
 # queues for processing
 # this queue holds the URL locations of files to download
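Most hunks below swap bare print() error messages for logger.warning()/logger.info() calls. The commit does not show the logger setup itself; a typical stdlib configuration that would support these calls might look like this (logger name and format are assumptions):

```python
import logging

# assumed setup, not part of this commit: a module-level logger that the
# logger.warning()/logger.info() calls in the hunks below would rely on
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s %(levelname)s %(message)s',
)
logger = logging.getLogger('pdfgrab')

logger.info('Store pdf {0}'.format('http://host/pdf/'))  # example call
```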
@@ -114,7 +114,7 @@ def get_xmp_meta_data(filename, filehandle):
         xmp_meta = fh.getXmpMetadata()
 
     except xml.parsers.expat.ExpatError as e:
-        print('Error: %s' % e)
+        logger.warning('get_xmp_meta_data error {0}'.format(e))
         err_dict = {'error': str(e)}
         return -1
 
@@ -122,8 +122,15 @@ def get_xmp_meta_data(filename, filehandle):
         process_queue_data(filename, err_dict, 'doc_xmp_info')
 
     if xmp_meta != None:
-        print('xmp_meta: {0} {1} {2} {3} {4} {5}'.format(xmp_meta.pdf_producer,xmp_meta.pdf_pdfversion,xmp_meta.dc_contributor,xmp_meta.dc_creator,xmp_meta.dc_date,xmp_meta.dc_subject))
-        xmp_dict = {}
+        try:
+
+            print('xmp_meta: {0} {1} {2} {3} {4} {5}'.format(xmp_meta.pdf_producer,xmp_meta.pdf_pdfversion,xmp_meta.dc_contributor,xmp_meta.dc_creator,xmp_meta.dc_date,xmp_meta.dc_subject))
+            #print('xmp_meta cache: {0}'.format(xmp_meta.cache))
+            #print('xmp_meta custom properties: {0}'.format(xmp_meta.custom_properties))
+            #embed()
+        except AttributeError as e:
+            logger.warning('xmp_meta print {0}'.format(e))
+            return False
 
     return xmp_dict
 
@@ -152,12 +159,12 @@ def get_DocInfo(filename, filehandle):
         extract = fh.documentInfo
 
     except pdf.utils.PdfReadError as e:
-        print('Error: %s' % e)
+        logger.warning('get_doc_info {0}'.format(e))
         err_dict = {'error': str(e)}
         return -1
 
     except PyPDF2.utils.PdfReadError as e:
-        print('Error: %s' % e)
+        logger.warning('get_doc_info {0}'.format(e))
         err_dict = {'error': str(e)}
         return -1
 
@@ -185,7 +192,7 @@ def get_DocInfo(filename, filehandle):
             print('-' * 80)
 
     except PyPDF2.utils.PdfReadError as e:
-        print('Error: %s' % e)
+        logger.warning('get_doc_info {0}'.format(e))
         err_dict = {'error': str(e)}
         process_queue_data(filename, err_dict, 'doc_info')
         return -1
@@ -204,8 +211,7 @@ def decrypt_empty_pdf(filename):
         fr.decrypt('')
 
     except NotImplementedError as e:
-        # print('Error: %s' % (e))
-        print('Error: File: %s encrypted. %s' % (filename, str(e)))
+        logger.warning('decrypt_empty_pdf {0}{1}'.format(filename,e))
         return -1
     return fr
 
@@ -214,11 +220,12 @@ def check_encryption(filename):
     ''' basic function to check if file is encrypted
     '''
 
-    # print(filename)
+    print(filename)
     try:
         fr = pdf.PdfFileReader(open(filename, "rb"))
+        print(fr)
     except pdf.utils.PdfReadError as e:
-        print('Error: %s' % e)
+        logger.warning('check encryption {0}'.format(e))
         return -1
 
     if fr.getIsEncrypted() == True:
@@ -250,15 +257,15 @@ def download_pdf(url, args, header_data):
         status_code = req.status_code
 
     except requests.exceptions.SSLError as e:
-        print('Error: %s' % e)
+        logger.warning('download pdf {0}{1}'.format(url,e))
         return -1
 
     except:
-        print('Error: Probably something wrong with remote server')
+        logger.warning('download pdf, something wrong with remote server? {0}'.format(url))
         return -1
 
     if status_code == 403:
-        print('%s http/403 Forbidden' % (url))
+        logger.warning('download pdf, 403 Forbidden {0}'.format(url))
         return -1
 
     # print(len(data))
@@ -269,8 +276,11 @@ def store_pdf(url, data, outdir):
     ''' storing the downloaded pdf data
     '''
 
-    logger.info('Store pdf')
+    logger.info('Store pdf {0}'.format(url))
     name = find_name(url)
+    #logger.warning(url)
+    #logger.warning(name)
+    #logger.warning(outdir)
 
     # only allow stored file a name with 50 chars
     if len(name) > 50:
@@ -282,13 +292,19 @@ def store_pdf(url, data, outdir):
     try:
         f = open(save, "wb")
     except OSError as e:
-        print('Error: %s' % (e))
+        logger.warning('store_pdf {0}'.format(e))
         return -1
 
     ret = f.write(data)
     logger.info('Written {0} bytes for file: {1}'.format(ret,save))
     f.close()
 
+    if ret == 0:
+        logger.warning('Written {0} bytes for file: {1}'.format(ret,save))
+        return save
+        #return -1
+
 
     # return the savepath
     return save
@@ -296,8 +312,13 @@ def store_pdf(url, data, outdir):
 def _parse_pdf(filename):
     ''' the real parsing function '''
 
-    ret = check_encryption(filename)
-    return ret
+    logger.warning('{0}'.format(filename))
+    if check_file_size(filename):
+        ret = check_encryption(filename)
+        return ret
+    else:
+        logger.warning('Filesize is 0 bytes at file: {0}'.format(filename))
+        return False
 
 
 def grab_url(url, args, outdir):
@@ -318,8 +339,16 @@ def seek_and_analyse(search, args, outdir):
     them together
     '''
     # use the search function of googlesearch to get the results
-    urls=search_pdf(search, args)
-    for item in urls:
+    code, values=search_pdf(search, args)
+    if not code:
+        if values.code == 429:
+            logger.warning('[-] Too many requests, time to change ip address or use proxychains')
+        else:
+            logger.warning('Google returned error {0}'.format(values))
+
+        return -1
+
+    for item in values:
         filename = find_name(item)
         process_queue_data(filename, item, 'url')
 
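search_pdf now returns a (code, values) pair: (True, urls) on success, (False, exception) on failure. Note that values.code only exists on urllib.error.HTTPError; the new dns-resolve case raises urllib.error.URLError, which carries .reason instead, so a defensive caller could check the type first. A hedged sketch of such a caller (function and names are illustrative, not from the commit):

```python
import urllib.error

def handle_search_result(code, values):
    # illustrative consumer of the new (code, values) convention
    if not code:
        if isinstance(values, urllib.error.HTTPError) and values.code == 429:
            print('[-] Too many requests, time to change ip address or use proxychains')
        else:
            print('Google returned error {0}'.format(values))
        return -1
    for url in values:
        print('queueing', url)
    return True

handle_search_result(True, ['http://host/pdf/report.pdf'])
handle_search_result(False, urllib.error.URLError('name resolution failed'))
```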
@@ -335,6 +364,7 @@ def seek_and_analyse(search, args, outdir):
         url = item['url']
         grab_url(url, args, outdir)
 
+    return True
 
 
 def run(args):
@@ -369,7 +399,8 @@ def run(args):
     elif args.search:
         search = args.search
         logger.info('Seek and analyse {0}'.format(search))
-        seek_and_analyse(search, args, outdir)
+        if not seek_and_analyse(search, args, outdir):
+            return -1
 
     elif args.files_dir:
         directory = args.files_dir