added decryption function for alg1/2
This commit is contained in:
88
pdfgrab.py
88
pdfgrab.py
@@ -10,10 +10,16 @@
|
|||||||
# TODO
|
# TODO
|
||||||
# * json output
|
# * json output
|
||||||
# * txt output
|
# * txt output
|
||||||
# * catch ssl exceptions
|
|
||||||
# * catch conn refused connections
|
# * catch conn refused connections
|
||||||
# * set option for certificate verification, default is false
|
# * set option for certificate verification, default is false
|
||||||
|
# * complete analyse.txt and separated
|
||||||
|
# * clean up code
|
||||||
|
# * do more testing
|
||||||
|
# * add random useragent for google and website pdf gathering
|
||||||
|
#
|
||||||
|
# Done
|
||||||
# * add decryption routine
|
# * add decryption routine
|
||||||
|
# * catch ssl exceptions
|
||||||
|
|
||||||
import os
|
import os
|
||||||
import sys
|
import sys
|
||||||
@@ -35,6 +41,61 @@ def url_strip(url):
|
|||||||
url = url.rstrip("\r")
|
url = url.rstrip("\r")
|
||||||
return url
|
return url
|
||||||
|
|
||||||
|
|
||||||
|
def get_DocInfo(filename, filehandle):
    ''' print the document information entries of an opened pdf

    filename   -- name of the pdf, only used for the printed header
    filehandle -- reader object exposing a ``documentInfo`` attribute
                  (presumably a PyPDF2 PdfFileReader; verify against caller)

    returns -1 when reading the document info raises PdfReadError,
    otherwise None
    '''
    try:
        extract = filehandle.documentInfo
    except pdf.utils.PdfReadError as e:
        print('Error: %s' % e)
        return -1

    # documentInfo may be None when the pdf carries no info dictionary;
    # treat that as "no entries" instead of crashing on .keys()
    if extract is None:
        extract = {}

    print('-'*80)
    print('File: %s' % filename)
    for k in extract.keys():
        edata = '%s %s' % (k, extract[k])
        print(edata)
    # a bare `print` is a no-op expression in python 3, call it to
    # actually emit the separating blank line
    print()
    print('-'*80)
|
||||||
|
|
||||||
|
|
||||||
|
def decrypt_empty_pdf(filename):
    ''' try to unlock an encrypted pdf with the empty password

    filename -- path of the pdf to open

    returns the reader object when the empty password was accepted,
    -1 when the encryption algorithm is unsupported or the empty
    password does not match

    On success the underlying file stays open, because the returned
    reader still reads from it; on every failure path it is closed here.
    '''
    fh = open(filename, "rb")
    try:
        fr = pdf.PdfFileReader(fh)
        # decrypt() reports a failed password through a falsy return
        # value, not through an exception
        matched = fr.decrypt('')
    except NotImplementedError as e:
        fh.close()
        print('Error: %s' % (e))
        print('Only algorithm code 1 and 2 are supported')
        return -1
    except Exception:
        # do not leak the handle when the reader itself fails
        fh.close()
        raise

    if not matched:
        fh.close()
        print('[-] Empty password does not decrypt %s' % filename)
        return -1

    return fr
|
||||||
|
|
||||||
|
|
||||||
|
def check_encryption(filename):
    ''' basic function to check if file is encrypted

    filename -- path of the pdf to inspect

    Prints the document info of the file; for an encrypted file this is
    only done when decrypt_empty_pdf() could unlock it.

    returns -1 when the pdf cannot be read (PdfReadError), True otherwise
    '''

    print(filename)
    # the with-block closes the handle used for the encryption probe,
    # also on the early error return
    with open(filename, "rb") as fh:
        try:
            fr = pdf.PdfFileReader(fh)
        except pdf.utils.PdfReadError as e:
            print('Error: %s' % e)
            return -1

        if fr.getIsEncrypted():
            print('[i] File encrypted %s' % filename)
            # decrypt_empty_pdf opens its own handle on the file
            nfr = decrypt_empty_pdf(filename)
            if nfr != -1:
                get_DocInfo(filename, nfr)
        else:
            get_DocInfo(filename, fr)

    return True
|
||||||
|
|
||||||
def find_name(pdf):
|
def find_name(pdf):
|
||||||
''' simply parses the urlencoded name and extracts the storage name
|
''' simply parses the urlencoded name and extracts the storage name
|
||||||
i would not be surprised this naive approach can lead to fuckups
|
i would not be surprised this naive approach can lead to fuckups
|
||||||
@@ -76,7 +137,12 @@ def store_pdf(url,data,outdir):
|
|||||||
''' storing the downloaded pdf data '''
|
''' storing the downloaded pdf data '''
|
||||||
name = find_name(url)
|
name = find_name(url)
|
||||||
save = "%s/%s" % (outdir,name)
|
save = "%s/%s" % (outdir,name)
|
||||||
f = open(save,"wb")
|
try:
|
||||||
|
f = open(save,"wb")
|
||||||
|
except OSError as e:
|
||||||
|
print('Error: %s' % (e))
|
||||||
|
return -1
|
||||||
|
|
||||||
ret=f.write(data)
|
ret=f.write(data)
|
||||||
print('[+] Written %d bytes for File: %s' % (ret,save))
|
print('[+] Written %d bytes for File: %s' % (ret,save))
|
||||||
f.close()
|
f.close()
|
||||||
@@ -87,6 +153,9 @@ def store_pdf(url,data,outdir):
|
|||||||
def _parse_pdf(filename):
|
def _parse_pdf(filename):
|
||||||
''' the real parsing function '''
|
''' the real parsing function '''
|
||||||
|
|
||||||
|
check_encryption(filename)
|
||||||
|
return True
|
||||||
|
|
||||||
print('[+] Opening %s' % filename)
|
print('[+] Opening %s' % filename)
|
||||||
pdfile = open(filename,'rb')
|
pdfile = open(filename,'rb')
|
||||||
|
|
||||||
@@ -95,20 +164,9 @@ def _parse_pdf(filename):
|
|||||||
except pdf.utils.PdfReadError as e:
|
except pdf.utils.PdfReadError as e:
|
||||||
print('[-] Error: %s' % (e))
|
print('[-] Error: %s' % (e))
|
||||||
return
|
return
|
||||||
|
|
||||||
|
return pdfile
|
||||||
|
|
||||||
try:
|
|
||||||
extract = h.documentInfo
|
|
||||||
except pdf.utils.PdfReadError as e:
|
|
||||||
print('Error: %s' % e)
|
|
||||||
return -1
|
|
||||||
|
|
||||||
print('-'*80)
|
|
||||||
print('File: %s' % filename)
|
|
||||||
for k in extract.keys():
|
|
||||||
edata = '%s %s' % (k,extract[k])
|
|
||||||
print(edata)
|
|
||||||
print
|
|
||||||
print('-'*80)
|
|
||||||
|
|
||||||
def parse_single_pdf(filename):
|
def parse_single_pdf(filename):
|
||||||
''' single parse function '''
|
''' single parse function '''
|
||||||
|
|||||||
Reference in New Issue
Block a user