several changes, new features, bugfixes
pdfgrab.py
@@ -7,41 +7,127 @@
# by dash at the end of September 2019
#
# TODO
# * json file output
# * txt file output
# * complete analyse.txt and separated
# * add complete path in output as well as url where pdf came from
#   -> if the url does not exist, like in -F mode, then the local path
# * clean up code
# * do more testing
# * fine tune google search
# * add random timeout for new requests
#   -> maybe not necessary, gs has it ...
#   -> sort of necessary, on the other hand use proxychains man
# * uh oh some fancy c0l0rs
# * catch filename too long thingy
# * add thread support
# * add scrape mode, to search for pdfs at the website itself
# * add current error conditions to logfile
#
# Done
# * add url list to output
# * queues added, but no thread support yet
# * json file output
# * txt file output
# * outfilename hardcoded
# * add decryption routine
# * catch ssl exceptions
# * add random useragent for google and website pdf gathering
# * set option for certificate verification, default is true
# * catch conn refused connections
# * catch filename too long thingy

import os
import sys
import json
import queue
import urllib
import argparse
import requests
import urllib

# remove somewhen ;)
from IPython import embed

from PyPDF2 import pdf
import PyPDF2
from Crypto.Hash import SHA256
from collections import deque

# googlesearch library
import googlesearch as gs

# some variables in regard of the tool itself
name = 'pdfgrab'
version = '0.4'
version = '0.4.4'
author = 'dash'
date = '2019'

# queues for processing
# this queue holds the URL locations of files to download
url_q = queue.Queue()
url_d = {}

# this queue holds the paths of files to analyse
pdf_q = queue.Queue()

# this is the analysis queue, keeping the data for further processing
ana_q = queue.Queue()

def create_sha256(hdata):
    ''' introduced to create hashes of filenames, to have a unique id
        of course hashes of the file itself will be the next topic
    '''
    hobject = SHA256.new(data=hdata.encode())
    return (hobject.hexdigest())
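
# Illustrative sketch (values hypothetical): create_sha256 returns the 64-char
# hex digest of its input string, e.g.
#   create_sha256('example.pdf')  ->  'b0a4...'  (64 hex chars in total)
# which is what url_d further below uses as a unique key per URL.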

def process_queue_data(filename,data,queue_type):
    ''' main function for processing gathered data
        I use this central function so the handling is at *one* place
        and it is easy to change the data handling at a later step without
        deconstructing the whole code
    '''
    ana_dict = {}
    url_dict = {}

    if queue_type=='doc_info':
        print('[v] Queue DocInfo Data %s' % (filename))
        name = find_name(filename)
        path = filename

        # create a hash over the file path
        # hm, removed for now
        #path_hash = create_sha256(path)

        # order data in dict for analyse queue
        ana_dict = {path : {'filename':name,'data':data}}
        # print(data)
        # print(ana_dict)

        # add the data to queue
        add_queue(ana_q,ana_dict)

    elif queue_type=='url':
        # prepare queue entry
        print('[v] Url Queue %s' % (data))
        url_dict = {'url':data,'filename':filename}
        sha256=create_sha256(data)
        url_d[sha256]=url_dict

        # add dict to queue
        add_queue(url_q,url_dict)

    else:
        print('[-] Sorry, unknown queue. DEBUG!')
        return False

    return True
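
# Sketch of the two queue entry shapes produced above (field values hypothetical):
#   ana_q items: { '/tmp/pdfgrab/example.pdf': {'filename': 'example.pdf',
#                                               'data': {'/Author': '...', '/Producer': '...'}} }
#   url_q items: { 'url': 'https://example.com/example.pdf', 'filename': 'example.pdf' }
# url_d additionally keys each url entry by create_sha256(url).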

def add_queue(tqueue, data):
    ''' wrapper function for easily adding data to the
        created queues. otherwise the functions will be scattered with
        endless queue commands ;)
    '''

    tqueue.put(data)
    #d=tqueue.get()
    #print(d)
    return True

def url_strip(url):
    url = url.rstrip("\n")
    url = url.rstrip("\r")
@@ -52,32 +138,85 @@ def get_random_agent():

def get_DocInfo(filename, filehandle):
    ''' the easy way to extract metadata

        indirectObjects...
        there is an interesting situation: some pdfs seem to have the same information stored
        in different places, or things are overwritten or whatever.
        this sometimes results in an extract output with indirect objects ... this is ugly:

        {'/Title': IndirectObject(111, 0), '/Producer': IndirectObject(112, 0), '/Creator': IndirectObject(113, 0), '/CreationDate': IndirectObject(114, 0), '/ModDate': IndirectObject(114, 0), '/Keywords': IndirectObject(115, 0), '/AAPL:Keywords': IndirectObject(116, 0)}

        normally getObject() is the method to use to resolve this, however it was not working in this particular case.
        this thing might even bring up some more nasty things, so as a (probably weak) defense and workaround
        the pdf object is not used anymore after this function; the data is converted to strings...
        the dict above is the bad example.
    '''
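
    # Illustrative note (assumption, not taken from this code): resolving an
    # IndirectObject would normally look roughly like
    #     title = extract['/Title'].getObject()
    # since PyPDF2's IndirectObject.getObject() follows the reference; as the
    # docstring says that did not work here, so values are str()-converted below.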

    err_dict = {}
    real_extract = {}

    fh = filehandle

    try:
        extract = fh.documentInfo

    except pdf.utils.PdfReadError as e:
        print('Error: %s' % e)
        err_dict={'error':str(e)}
        return -1

    except PyPDF2.utils.PdfReadError as e:
        print('Error: %s' % e)
        err_dict={'error':str(e)}
        return -1

    finally:
        process_queue_data(filename,err_dict,'doc_info')

    print('-'*80)
    print('File: %s' % filename)
    for k in extract.keys():
        edata = '%s %s' % (k,extract[k])
        print(edata)
        print
    print('-'*80)
    # embed()
    # there are situations when documentinfo does not return anything
    # and extract is None
    if extract==None:
        err_dict={'error':'getDocumentInfo() returns None'}
        process_queue_data(filename,err_dict,'doc_info')
        return -1

    try:
        for k in extract.keys():
            key = str(k)
            value = str(extract[k])
            edata = '%s %s' % (key,value)
            print(edata)
            print
            real_extract[key]=value
        print('-'*80)

    except PyPDF2.utils.PdfReadError as e:
        print('Error: %s' % e)
        err_dict={'error':str(e)}
        process_queue_data(filename,err_dict,'doc_info')
        return -1

    process_queue_data(filename,real_extract,'doc_info')


def decrypt_empty_pdf(filename):
    ''' this function simply tries to decrypt the pdf with the null password
        this does work, as long as no real password has been set
        if a complex password has been set -> john
    '''

    fr = pdf.PdfFileReader(open(filename,"rb"))
    try:
        fr.decrypt('')

    except NotImplementedError as e:
        print('Error: %s' % (e))
        print('Only algorithm code 1 and 2 are supported')
        #print('Error: %s' % (e))
        print('Error: File: %s encrypted. %s' % (filename,str(e)))
        return -1
    return fr

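# Side note (assumption about the library, not shown in this diff): in the classic
# PyPDF2 1.x API, PdfFileReader.decrypt(password) is documented to return 0 when
# decryption failed, 1 when the user password matched and 2 for the owner password,
# so a caller could also inspect that return value. For a real password the hint
# "-> john" refers to cracking it externally, e.g. with John the Ripper.
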
@@ -86,7 +225,7 @@ def check_encryption(filename):
    ''' basic function to check if file is encrypted
    '''

    print(filename)
    # print(filename)
    try:
        fr = pdf.PdfFileReader(open(filename,"rb"))
    except pdf.utils.PdfReadError as e:
@@ -137,25 +276,36 @@ def download_pdf(url, args, header_data):
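        # Illustrative note (assumption): header_data is presumably a requests-style
        # headers dict carrying the random User-Agent mentioned in the Done list, e.g.
        #   header_data = {'User-Agent': 'Mozilla/5.0 (...)'}
        # and cert_check is the certificate-verification switch (default true).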
        req = requests.get(url,headers=header_data,verify=cert_check)
        #req = requests.get(url,headers=header_data,verify=False)
        data = req.content
        status_code = req.status_code

    except requests.exceptions.SSLError as e:
        print('Error: %s' % e)
        return -1

    except:
        print('Error: Probably something wrong with remote server')
        return -1

    if status_code == 403:
        print('%s http/403 Forbidden' % (url))
        return -1

    #print(len(data))
    return data

def store_pdf(url,data,outdir):
    ''' storing the downloaded pdf data
    '''
    print('[v] store_pdf')
    name = find_name(url)

    # only allow the stored file a name of 50 chars
    name = name[:49] + '.pdf'
    print(len(name))
    if len(name)>50:
        name = name[:49] + '.pdf'
        #print(len(name))

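    # Reading aid (assumption): this cap appears to be the fix for the "catch
    # filename too long thingy" TODO above; name[:49] keeps at most 49 chars before
    # '.pdf' is appended, well under the ~255-byte file name limit of most filesystems.
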
    save = "%s/%s" % (outdir,name)

    try:
        f = open(save,"wb")
    except OSError as e:
@@ -192,13 +342,18 @@ def seek_and_analyse(search,args,outdir):
        them together
    '''
    # use the search function of googlesearch to get the results
    urls = search_pdf(search,args)
    search_pdf(search,args)
    #urls = search_pdf(search,args)

    # *if* we get an answer
    if urls != -1:
    if url_q.empty()==False:
    #if urls != -1:
        # process through the list and get the pdfs
        for url in urls:
        while url_q.empty()==False:
            item=url_q.get()
            #print(item)
            url = item['url']
            grab_url(url,args,outdir)
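
# Reading aid for the change above: in the newer queue-based flow, search_pdf() no
# longer needs to return a list; it pushes each hit into url_q via
# process_queue_data(), and seek_and_analyse() simply drains url_q and calls
# grab_url() per entry.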

def search_pdf(search, args):
@@ -214,16 +369,26 @@ def search_pdf(search, args):

    try:
        for url in gs.search(query,num=20,stop=search_stop,user_agent=gs.get_random_user_agent()):
            print(url)
            #print(url)
            # parse out the name of the file in the url
            filename=find_name(url)
            # add the file to queue
            process_queue_data(filename,url,'url')
            urls.append(url)

    except urllib.error.HTTPError as e:
        print('Error: %s' % e)
        return -1
    return urls
    #return urls
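
# Illustration (hypothetical values): for a hit such as
#   https://example.com/papers/whitepaper.pdf
# find_name() is expected to yield 'whitepaper.pdf', which is queued together with
# the full URL via process_queue_data(..., 'url') and later written to the
# *_url.txt / *_url.json report files.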

def run(args):

    # outfile name
    if args.outfile:
        out_filename = args.outfile
    else:
        out_filename = 'pdfgrab_analysis'

    # specify output directory
    outdir = args.outdir

@@ -250,14 +415,74 @@ def run(args):
    elif args.files_dir:
        directory = args.files_dir
        print('[+] Analyse pdfs in directory %s' % (directory))
        files = os.listdir(directory)
        try:
            files = os.listdir(directory)
        except:
            print('Error')
            return False

        for f in files:
            fpath = '%s/%s' % (directory,f)
            _parse_pdf(fpath)
            # naive filter function, later usage of filemagic possible
            if f.find('.pdf')!=-1:
                fpath = '%s/%s' % (directory,f)
                _parse_pdf(fpath)

    else:
        print('[-] Dunno what to do, bro.')

    # move analysis dictionary in queue back to dictionary
    analysis_dict = {}
    while ana_q.empty()==False:
        item = ana_q.get()
        #print('item ', item)
        analysis_dict.update(item)

    # ana_q is empty now

    # create txt output
    sep = '-'*80 + '\n'
    txtout = "%s/%s.txt" % (outdir,out_filename)
    fwtxt = open(txtout,'w')
    #print(analysis_dict)
    for k in analysis_dict.keys():
        fwtxt.write(sep)
        fname = 'File: %s\n' % (analysis_dict[k]['filename'])
        ddata = analysis_dict[k]['data']
        fwtxt.write(fname)
        for kdata in ddata.keys():
            metatxt = '%s:%s\n' % (kdata,ddata[kdata])
            fwtxt.write(metatxt)
        fwtxt.write(sep)
    fwtxt.close()
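
    # Sketch of one record in the resulting .txt report (values hypothetical):
    #   --------------------------------------------------------------------------------
    #   File: example.pdf
    #   /Author:John Doe
    #   /Producer:Some PDF Library 1.2
    #   --------------------------------------------------------------------------------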

    # create json output
    jsonout = "%s/%s.json" % (outdir,out_filename)
    fwjson = open(jsonout,'w')
    #for k in analysis_dict.keys():
    #print(analysis_dict[k])
    # jdata = json.dumps(analysis_dict[k])

    #print(analysis_dict)
    jdata = json.dumps(analysis_dict)
    fwjson.write(jdata)
    fwjson.close()
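
    # Sketch of the .json layout (values hypothetical): analysis_dict is keyed by the
    # local file path as built in process_queue_data(), i.e.
    #   { "pdfgrab/example.pdf": { "filename": "example.pdf",
    #                              "data": { "/Author": "John Doe", "/Producer": "..." } } }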

    # create url savefile
    #print('url_d: ', url_d)
    jsonurlout = "%s/%s_url.json" % (outdir,out_filename)
    fwjson = open(jsonurlout,'w')
    jdata = json.dumps(url_d)
    fwjson.write(jdata)
    fwjson.close()

    txtout = "%s/%s_url.txt" % (outdir,out_filename)
    fwtxt = open(txtout,'w')
    for k in url_d.keys():
        ddata = url_d[k]
        metatxt = '%s:%s\n' % (ddata['url'], ddata['filename'])
        fwtxt.write(metatxt)
    fwtxt.close()
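
    # Sketch of the *_url.json layout (values hypothetical): url_d is keyed by the
    # sha256 of each URL, so one entry looks like
    #   { "<64-hex-char sha256>": { "url": "https://example.com/example.pdf",
    #                               "filename": "example.pdf" } }
    # while *_url.txt simply lists "url:filename" pairs, one per line.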

    return 42
    # This is the end my friend.
@@ -266,7 +491,7 @@ def main():
    parser_desc = "%s %s %s in %s" % (name,version,author,date)
    parser = argparse.ArgumentParser(prog = name, description=parser_desc)
    parser.add_argument('-O','--outdir',action='store',dest='outdir',required=False,help="define the outdirectory for downloaded files and analysis output",default='pdfgrab')
    # parser.add_argument('-o','--outfile',action='store',dest='outfile',required=False,help="define file with analysis output in txt format",default='pdfgrab_analysis.txt')
    parser.add_argument('-o','--outfile',action='store',dest='outfile',required=False,help="define file with analysis output; if no parameter is given it is outdir/pdfgrab_analysis. please note the outfile is *always* written to the output directory, so do not add the dir as an extra path")
    parser.add_argument('-u','--url',action='store',dest='url_single',required=False,help="grab pdf from specified url for analysis",default=None)
    #parser.add_argument('-U','--url-list',action='store',dest='urls_many',required=False,help="specify txt file with list of pdf urls to grab",default=None)
    #########