release of version 0.4.7: added HTML reporting, added logging, reordered code into libraries, added experimental XMP metadata support, fixed a bug introduced by the XMP metadata change, added a todo list

docs/Changelog (new file, 20 lines)

Changelog
=========

Version 4.7
-----------

* added html out
* added xmp meta testing

Version 4.6
-----------

* added help for non-argument given at cli
* added googlesearch lib

Version 4.5
-----------

* exported helper functions to libs/helper.py
* added libs/liblog.py

docs/Todo (new file, 4 lines)

* add xmp meta to output files
* code reordering
* clean up parsing functions
* add report formats

libs/__init__.py (new file, empty)

libs/libgoogle.py (new file, 30 lines)

import googlesearch as gs
import urllib
from libs.libhelper import *

def get_random_agent():
    return (gs.get_random_user_agent())

def search_pdf(search, args):
    ''' the function where googlesearch from mario vilas
    is called
    '''

    search_stop = args.search_stop

    query = '%s filetype:pdf' % search
    # print(query)
    urls = []

    try:
        for url in gs.search(query, num=20, stop=search_stop, user_agent=gs.get_random_user_agent()):
            #print(url)
            urls.append(url)

    except urllib.error.HTTPError as e:
        print('Error: %s' % e)
        return -1


    return urls
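
Note: the following is a minimal usage sketch of libs/libgoogle.py, not part of the commit. It assumes the googlesearch package by Mario Vilas is installed, network access is available, and the snippet is run from the repository root; the domain and stop count are made up for illustration.

# hypothetical driver for search_pdf(); pdfgrab.py itself passes its argparse result as args
import argparse

from libs.libgoogle import search_pdf

# mimic the namespace pdfgrab builds (-sn / --search-number fills search_stop)
args = argparse.Namespace(search_stop=5)

urls = search_pdf('example.com', args)   # returns a list of urls, or -1 on HTTPError
if urls != -1:
    for url in urls:
        print(url)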

libs/libhelper.py (new file, 37 lines)

import os
import sys
from Crypto.Hash import SHA256

def make_directory(outdir):
    ''' naive mkdir function '''
    try:
        os.mkdir(outdir)
    except:
        # print("[W] mkdir, some error, directory probably exists")
        pass

def url_strip(url):
    url = url.rstrip("\n")
    url = url.rstrip("\r")
    return url

def create_sha256(hdata):
    ''' introduced to create hashes of filenames, to have a uniqid
    of course hashes of the file itself will be the next topic
    '''
    hobject = SHA256.new(data=hdata.encode())
    return (hobject.hexdigest())

def find_name(pdf):
    ''' simply parses the urlencoded name and extracts the storage name
    i would not be surprised this naive approach can lead to fuckups
    '''

    # find the name of the file
    name = pdf.split("/")
    a = len(name)
    name = name[a - 1]
    # print(name)

    return name
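
Note: a quick sketch of the helpers above, not part of the commit; the URL is made up, and Crypto.Hash requires pycryptodome (or pycrypto) to be installed.

# hypothetical example of the libs/libhelper.py functions
from libs.libhelper import url_strip, find_name, create_sha256

url = url_strip('https://example.com/docs/report.pdf\r\n')
print(find_name(url))      # 'report.pdf' - last component of the url path
print(create_sha256(url))  # sha256 hex digest, used by pdfgrab as a unique id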

libs/liblog.py (new file, 17 lines)

import logging

logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)

file_handler = logging.FileHandler('pdfgrab.log')

console_handler = logging.StreamHandler()
console_handler.setLevel(logging.WARNING)

formatter = logging.Formatter('%(asctime)s:%(name)s:%(levelname)s:%(message)s')

file_handler.setFormatter(formatter)
console_handler.setFormatter(formatter)

logger.addHandler(file_handler)
logger.addHandler(console_handler)
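
Note: a minimal sketch of how the shared logger above is consumed, not part of the commit; pdfgrab.py does the same import in the diff below. The message strings are illustrative.

# hypothetical usage of the module-level logger from libs/liblog.py
from libs.liblog import logger

logger.info('pdfgrab started')                 # DEBUG and above end up in pdfgrab.log
logger.warning('certificate check disabled')   # WARNING and above also reach the console (stderr)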

pdfgrab.py (modified, 742 lines changed)

@@ -1,61 +1,35 @@
 #!/usr/bin/env python3
 #####################
-# yay - old tool adjusted for python3, using googlesearch now
-# and not some self crafted f00
-#
 # new features, new layout, new new :>
-# by dash at the end of September 2019
-#
-# TODO
-# * add complete path in output as well as url where pdf came from
-#   -> if url not exist like -F mode, then the local path
-# * clean up code
-# * fine tune google search
-# * add random timeout for new requests
-#   -> maybe not necessary, gs has it ...
-#   -> sort of necessary, on the other hand use proxychains man
-# * uh oh some fancy c0l0rs
-# * add thread support
-# * add scrape mode, to search for pdfs at the website itself
-# * add current error conditions to logfile
-#
-# Done
-# * add url list to output
-# * queues added, but no thread support yet
-# * json file output
-# * txt file output
-# * outfilename hardcoded
-# * add decryption routine
-# * catch ssl exceptions
-# * add random useragent for google and website pdf gathering
-# * set option for certificate verification, default is true
-# * catch conn refused connections
-# * catch filename to long thingy
+# by dash
 
-import os
-import sys
+import xml
+import argparse
 import json
+import os
 import queue
 import urllib
-import argparse
-import requests
-
-# remove somewhen ;)
-from IPython import embed
-
-from PyPDF2 import pdf
+from json2html import *
 import PyPDF2
-from Crypto.Hash import SHA256
-from collections import deque
 
 # googlesearch library
 import googlesearch as gs
+import requests
+from PyPDF2 import pdf
+
+# functions to extern files
+from libs.liblog import logger
+from libs.libhelper import *
+from libs.libgoogle import *
+
+from IPython import embed
 
 # some variables in regard of the tool itself
 name = 'pdfgrab'
-version = '0.4.4'
+version = '0.4.7'
 author = 'dash'
 date = '2019'
 
 # queues for processing
 # this queue holds the URL locations of files to download
@@ -68,76 +42,93 @@ pdf_q = queue.Queue()
 # this is the analysis queue, keeping the data for further processing
 ana_q = queue.Queue()
 
-def create_sha256(hdata):
-    ''' introduced to create hashes of filenames, to have a uniqid
-    of course hashes of the file itself will be the next topic
-    '''
-    hobject = SHA256.new(data=hdata.encode())
-    return (hobject.hexdigest())
-
-def process_queue_data(filename,data,queue_type):
-    ''' main function for processing gathered data
-    i use this central function for it, so it is at *one* place
-    and it is easy to change the data handling at a later step without
-    deconstructing the who code
-    '''
-    ana_dict = {}
-    url_dict = {}
-
-    if queue_type=='doc_info':
-        print('[v] Queue DocInfo Data %s' % (filename))
-        name = find_name(filename)
-        path = filename
-
-        # create a hash over the file path
-        # hm, removed for now
-        #path_hash = create_sha256(path)
-
-        # order data in dict for analyse queue
-        ana_dict = {path : {'filename':name,'data':data}}
-        # print(data)
-        # print(ana_dict)
-
-        # add the data to queue
-        add_queue(ana_q,ana_dict)
-
-    elif queue_type=='url':
-        # prepare queue entry
-        print('[v] Url Queue %s' % (data))
-        url_dict = {'url':data,'filename':filename}
-        sha256=create_sha256(data)
-        url_d[sha256]=url_dict
-
-        # add dict to queue
-        add_queue(url_q,url_dict)
-
-    else:
-        print('[-] Sorry, unknown queue. DEBUG!')
-        return False
-
-    return True
-
 def add_queue(tqueue, data):
     ''' wrapper function for adding easy data to
     created queues. otherwise the functions will be scattered with
     endless queue commands ;)
     '''
 
     tqueue.put(data)
-    #d=tqueue.get()
-    #print(d)
+    # d=tqueue.get()
+    #logging.debug(d)
     return True
 
-def url_strip(url):
-    url = url.rstrip("\n")
-    url = url.rstrip("\r")
-    return url
-
-
-def get_random_agent():
-    return (gs.get_random_user_agent())
-
+def process_queue_data(filename, data, queue_type):
+    ''' main function for processing gathered data
+    i use this central function for it, so it is at *one* place
+    and it is easy to change the data handling at a later step without
+    deconstructing the who code
+    '''
+    ana_dict = {}
+    url_dict = {}
+
+    if queue_type == 'doc_info':
+        logger.info('Queue DocInfo Data {0}'.format(filename))
+        name = find_name(filename)
+        path = filename
+
+        # create a hash over the file path
+        # hm, removed for now
+        # path_hash = create_sha256(path)
+
+        # order data in dict for analyse queue
+        ana_dict = {path: {'filename': name, 'data': data}}
+        #print('data:',data)
+        #print('ana_dcit:',ana_dict)
+
+        # add the data to queue
+        add_queue(ana_q, ana_dict)
+
+    elif queue_type == 'doc_xmp_info':
+        logger.info('Queue DocXMPInfo Data {0}'.format(filename))
+        logger.warning('DocXMPInfo json processing not supported {0}'.format(filename))
+
+    elif queue_type == 'url':
+        # prepare queue entry
+        logger.info('Url Queue {0}'.format(data))
+        url_dict = {'url': data, 'filename': filename}
+        sha256 = create_sha256(data)
+        url_d[sha256] = url_dict
+
+        # add dict to queue
+        add_queue(url_q, url_dict)
+
+    else:
+        print('[-] Sorry, unknown queue. DEBUG!')
+        logger.critical('Unknown queue')
+        return False
+
+    return True
+
+def get_xmp_meta_data(filename, filehandle):
+    ''' get the xmp meta data
+    '''
+
+    err_dict = {}
+    real_extract = {}
+    xmp_dict = {}
+
+    fh = filehandle
+
+    try:
+        xmp_meta = fh.getXmpMetadata()
+
+    except xml.parsers.expat.ExpatError as e:
+        print('Error: %s' % e)
+        err_dict = {'error': str(e)}
+        return -1
+
+    finally:
+        process_queue_data(filename, err_dict, 'doc_xmp_info')
+
+    if xmp_meta != None:
+        print('xmp_meta: {0} {1} {2} {3} {4} {5}'.format(xmp_meta.pdf_producer,xmp_meta.pdf_pdfversion,xmp_meta.dc_contributor,xmp_meta.dc_creator,xmp_meta.dc_date,xmp_meta.dc_subject))
+        xmp_dict = {}
+
+    return xmp_dict
+
 def get_DocInfo(filename, filehandle):
     ''' the easy way to extract metadata
 
     indirectObjects...
     there is an interesting situation, some pdfs seem to have the same information stored
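
Note: a stand-alone sketch of the experimental XMP path introduced in get_xmp_meta_data() above, not part of the commit. It assumes PyPDF2 1.x (pdf.PdfFileReader / getXmpMetadata); 'some.pdf' is a placeholder filename.

# hypothetical stand-alone version of the xmp extraction step
from PyPDF2 import pdf

fr = pdf.PdfFileReader(open('some.pdf', 'rb'))
xmp_meta = fr.getXmpMetadata()
if xmp_meta != None:
    # same fields the new code prints: producer, pdf version, dublin core entries
    print(xmp_meta.pdf_producer, xmp_meta.pdf_pdfversion, xmp_meta.dc_creator, xmp_meta.dc_date)
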
@@ -152,357 +143,346 @@ def get_DocInfo(filename, filehandle):
     bad example:
     '''
 
     err_dict = {}
     real_extract = {}
 
     fh = filehandle
 
     try:
         extract = fh.documentInfo
 
     except pdf.utils.PdfReadError as e:
         print('Error: %s' % e)
-        err_dict={'error':str(e)}
+        err_dict = {'error': str(e)}
         return -1
 
     except PyPDF2.utils.PdfReadError as e:
         print('Error: %s' % e)
-        err_dict={'error':str(e)}
+        err_dict = {'error': str(e)}
         return -1
 
     finally:
-        process_queue_data(filename,err_dict,'doc_info')
+        process_queue_data(filename, err_dict, 'doc_info')
 
-    print('-'*80)
+    print('-' * 80)
     print('File: %s' % filename)
     # embed()
     # there are situations when documentinfo does not return anything
     # and extract is None
-    if extract==None:
-        err_dict={'error':'getDocumentInfo() returns None'}
-        process_queue_data(filename,err_dict,'doc_info')
+    if extract == None:
+        err_dict = {'error': 'getDocumentInfo() returns None'}
+        process_queue_data(filename, err_dict, 'doc_info')
         return -1
 
-    try:
-        for k in extract.keys():
-            key = str(k)
-            value = str(extract[k])
-            edata = '%s %s' % (key,value)
-            print(edata)
-            print
-            real_extract[key]=value
-        print('-'*80)
-
-    except PyPDF2.utils.PdfReadError as e:
-        print('Error: %s' % e)
-        err_dict={'error':str(e)}
-        process_queue_data(filename,err_dict,'doc_info')
-        return -1
-
-
-    process_queue_data(filename,real_extract,'doc_info')
+    try:
+        for k in extract.keys():
+            key = str(k)
+            value = str(extract[k])
+            edata = '%s %s' % (key, value)
+            print(edata)
+            print
+            real_extract[key] = value
+        print('-' * 80)
+
+    except PyPDF2.utils.PdfReadError as e:
+        print('Error: %s' % e)
+        err_dict = {'error': str(e)}
+        process_queue_data(filename, err_dict, 'doc_info')
+        return -1
+
+    process_queue_data(filename, real_extract, 'doc_info')
 
 
 def decrypt_empty_pdf(filename):
     ''' this function simply tries to decrypt the pdf with the null password
     this does work, as long as no real password has been set
     if a complex password has been set -> john
     '''
 
-    fr = pdf.PdfFileReader(open(filename,"rb"))
+    fr = pdf.PdfFileReader(open(filename, "rb"))
     try:
         fr.decrypt('')
 
     except NotImplementedError as e:
-        #print('Error: %s' % (e))
-        print('Error: File: %s encrypted. %s' % (filename,str(e)))
+        # print('Error: %s' % (e))
+        print('Error: File: %s encrypted. %s' % (filename, str(e)))
         return -1
     return fr
 
 
 def check_encryption(filename):
     ''' basic function to check if file is encrypted
     '''
 
     # print(filename)
     try:
-        fr = pdf.PdfFileReader(open(filename,"rb"))
+        fr = pdf.PdfFileReader(open(filename, "rb"))
     except pdf.utils.PdfReadError as e:
         print('Error: %s' % e)
         return -1
 
-    if fr.getIsEncrypted()==True:
+    if fr.getIsEncrypted() == True:
         print('[i] File encrypted %s' % filename)
         nfr = decrypt_empty_pdf(filename)
         if nfr != -1:
-            get_DocInfo(filename,nfr)
+            get_DocInfo(filename, nfr)
+            get_xmp_meta_data(filename,nfr)
 
     else:
-        get_DocInfo(filename,fr)
+        get_DocInfo(filename, fr)
+        get_xmp_meta_data(filename,fr)
 
-    #fr.close()
+    # fr.close()
 
     return True
 
-def find_name(pdf):
-    ''' simply parses the urlencoded name and extracts the storage name
-    i would not be surprised this naive approach can lead to fuckups
-    '''
-
-    #find the name of the file
-    name = pdf.split("/")
-    a = len(name)
-    name = name[a-1]
-    #print(name)
-
-    return name
-
-def make_directory(outdir):
-    ''' naive mkdir function '''
-    try:
-        os.mkdir(outdir)
-    except:
-        #print("[W] mkdir, some error, directory probably exists")
-        pass
-
 def download_pdf(url, args, header_data):
     ''' downloading the pdfile for later analysis '''
 
     # check the remote tls certificate or not?
     cert_check = args.cert_check
 
     try:
-        req = requests.get(url,headers=header_data,verify=cert_check)
-        #req = requests.get(url,headers=header_data,verify=False)
+        req = requests.get(url, headers=header_data, verify=cert_check)
+        # req = requests.get(url,headers=header_data,verify=False)
         data = req.content
         status_code = req.status_code
 
     except requests.exceptions.SSLError as e:
         print('Error: %s' % e)
         return -1
 
     except:
         print('Error: Probably something wrong with remote server')
         return -1
 
     if status_code == 403:
         print('%s http/403 Forbidden' % (url))
         return -1
 
-    #print(len(data))
+    # print(len(data))
     return data
 
-def store_pdf(url,data,outdir):
-    ''' storing the downloaded pdf data
-    '''
-    print('[v] store_pdf')
-    name = find_name(url)
-
-    # only allow stored file a name with 50 chars
-    if len(name)>50:
-        name = name[:49] + '.pdf'
-    #print(len(name))
-
-    save = "%s/%s" % (outdir,name)
-
-    try:
-        f = open(save,"wb")
-    except OSError as e:
-        print('Error: %s' % (e))
-        return -1
-
-    ret=f.write(data)
-    print('[+] Written %d bytes for File: %s' % (ret,save))
-    f.close()
-
-    # return the savepath
-    return save
-
+def store_pdf(url, data, outdir):
+    ''' storing the downloaded pdf data
+    '''
+
+    logger.info('Store pdf')
+    name = find_name(url)
+
+    # only allow stored file a name with 50 chars
+    if len(name) > 50:
+        name = name[:49] + '.pdf'
+    # print(len(name))
+
+    save = "%s/%s" % (outdir, name)
+
+    try:
+        f = open(save, "wb")
+    except OSError as e:
+        print('Error: %s' % (e))
+        return -1
+
+    ret = f.write(data)
+    logger.info('Written {0} bytes for file: {1}'.format(ret,save))
+    f.close()
+
+    # return the savepath
+    return save
 
 def _parse_pdf(filename):
     ''' the real parsing function '''
 
-    ret = check_encryption(filename)
-    return ret
+    ret = check_encryption(filename)
+    return ret
 
 def grab_url(url, args, outdir):
     ''' function keeping all the steps for the user call of grabbing
     just one pdf and analysing it
     '''
-    header_data={'User-Agent':get_random_agent()}
-    data = download_pdf(url,args, header_data)
+    header_data = {'User-Agent': get_random_agent()}
+    data = download_pdf(url, args, header_data)
     if data != -1:
         savepath = store_pdf(url, data, outdir)
         _parse_pdf(savepath)
 
     return
 
-def seek_and_analyse(search,args,outdir):
-    ''' function for keeping all the steps of searching for pdfs and analysing
-    them together
-    '''
-    # use the search function of googlesearch to get the results
-    search_pdf(search,args)
-    #urls = search_pdf(search,args)
-
-
-    # *if* we get an answer
-    if url_q.empty()==False:
-    #if urls != -1:
-        # process through the list and get the pdfs
-        while url_q.empty()==False:
-            item=url_q.get()
-            #print(item)
-            url = item['url']
-            grab_url(url,args,outdir)
-
-def search_pdf(search, args):
-    ''' the function where googlesearch from mario vilas
-    is called
-    '''
-
-    search_stop = args.search_stop
-
-    query='%s filetype:pdf' % search
-    #print(query)
-    urls = []
-
-    try:
-        for url in gs.search(query,num=20,stop=search_stop,user_agent=gs.get_random_user_agent()):
-            #print(url)
-            # parse out the name of the file in the url
-            filename=find_name(url)
-            # add the file to queue
-            process_queue_data(filename,url,'url')
-            urls.append(url)
-
-    except urllib.error.HTTPError as e:
-        print('Error: %s' % e)
-        return -1
-    #return urls
-
+def seek_and_analyse(search, args, outdir):
+    ''' function for keeping all the steps of searching for pdfs and analysing
+    them together
+    '''
+    # use the search function of googlesearch to get the results
+    urls=search_pdf(search, args)
+    for item in urls:
+        filename = find_name(item)
+        process_queue_data(filename, item, 'url')
+    # urls = search_pdf(search,args)
+
+    # *if* we get an answer
+    if url_q.empty() == False:
+    # if urls != -1:
+        # process through the list and get the pdfs
+        while url_q.empty() == False:
+            item = url_q.get()
+            # print(item)
+            url = item['url']
+            grab_url(url, args, outdir)
+
 def run(args):
 
-    # outfile name
-    if args.outfile:
-        out_filename = args.outfile
-    else:
-        out_filename = 'pdfgrab_analysis'
-
-    # specify output directory
-    outdir = args.outdir
-
-    # create output directory
-    make_directory(outdir)
-
-    # lets see what the object is
-    if args.url_single:
-        url = args.url_single
-        print('[+] Grabbing %s' % (url))
-        grab_url(url, args,outdir)
-
-    elif args.file_single:
-        pdffile = args.file_single
-        print('[+] Parsing %s' % (pdffile))
-        _parse_pdf(pdffile)
-
-    elif args.search:
-        search = args.search
-        #print(args)
-        print('[+] Seek and de...erm...analysing %s' % (search))
-        seek_and_analyse(search,args,outdir)
-
-    elif args.files_dir:
-        directory = args.files_dir
-        print('[+] Analyse pdfs in directory %s' % (directory))
-        try:
-            files = os.listdir(directory)
-        except:
-            print('Error')
-            return False
-
-        for f in files:
-            # naive filter function, later usage of filemagic possible
-            if f.find('.pdf')!=-1:
-                fpath = '%s/%s' % (directory,f)
-                _parse_pdf(fpath)
-
-    else:
-        print('[-] Dunno what to do, bro.')
-
-    # move analysis dictionary in queue back to dictionary
-    analysis_dict = {}
-    while ana_q.empty()==False:
-        item = ana_q.get()
-        #print('item ', item)
-        analysis_dict.update(item)
-
-    # ana_q is empty now
-
-    # create txt output
-    sep = '-'*80 + '\n'
-    txtout = "%s/%s.txt" % (outdir,out_filename)
-    fwtxt = open(txtout,'w')
-    #print(analysis_dict)
-    for k in analysis_dict.keys():
-        fwtxt.write(sep)
-        fname = 'File: %s\n' % (analysis_dict[k]['filename'])
-        ddata = analysis_dict[k]['data']
-        fwtxt.write(fname)
-        for kdata in ddata.keys():
-            metatxt = '%s:%s\n' % (kdata,ddata[kdata])
-            fwtxt.write(metatxt)
-        fwtxt.write(sep)
-    fwtxt.close()
-
-    # create json output
-    jsonout = "%s/%s.json" % (outdir,out_filename)
-    fwjson = open(jsonout,'w')
-    #for k in analysis_dict.keys():
-        #print(analysis_dict[k])
-        # jdata = json.dumps(analysis_dict[k])
-
-    #print(analysis_dict)
-    jdata = json.dumps(analysis_dict)
-    fwjson.write(jdata)
-    fwjson.close()
-
-    # create url savefile
-    #print('url_d: ', url_d)
-    jsonurlout = "%s/%s_url.json" % (outdir,out_filename)
-    fwjson = open(jsonurlout,'w')
-    jdata = json.dumps(url_d)
-    fwjson.write(jdata)
-    fwjson.close()
-
-    txtout = "%s/%s_url.txt" % (outdir,out_filename)
-    fwtxt = open(txtout,'w')
-    for k in url_d.keys():
-        ddata = url_d[k]
-        metatxt = '%s:%s\n' % (ddata['url'], ddata['filename'])
-        fwtxt.write(metatxt)
-    fwtxt.close()
-
-    return 42
-    # This is the end my friend.
+    # initialize logger
+    logger.info('{0} Started'.format(name))
+
+    # outfile name
+    if args.outfile:
+        out_filename = args.outfile
+    else:
+        out_filename = 'pdfgrab_analysis'
+
+    # specify output directory
+    outdir = args.outdir
+
+    # create output directory
+    make_directory(outdir)
+
+    # lets see what the object is
+    if args.url_single:
+        url = args.url_single
+        logger.info('Grabbing {0}'.format(url))
+        logger.write_to_log('Grabbing %s' % (url))
+        grab_url(url, args, outdir)
+
+    elif args.file_single:
+        pdffile = args.file_single
+        logger.info('Parsing {0}'.format(pdffile))
+        _parse_pdf(pdffile)
+
+    elif args.search:
+        search = args.search
+        logger.info('Seek and analyse {0}'.format(search))
+        seek_and_analyse(search, args, outdir)
+
+    elif args.files_dir:
+        directory = args.files_dir
+        logger.info('Analyse pdfs in directory {0}'.format(directory))
+        try:
+            files = os.listdir(directory)
+        except:
+            logger.warning('Error in args.files_dir')
+            return False
+
+        for f in files:
+            # naive filter function, later usage of filemagic possible
+            if f.find('.pdf') != -1:
+                fpath = '%s/%s' % (directory, f)
+                _parse_pdf(fpath)
+
+    else:
+        print('[-] Dunno what to do, bro. Use help. {0} -h'.format(sys.argv[0]))
+
+    # move analysis dictionary in queue back to dictionary
+    analysis_dict = {}
+    while ana_q.empty() == False:
+        item = ana_q.get()
+        # print('item ', item)
+        analysis_dict.update(item)
+
+    #print('dict:',analysis_dict)
+    # ana_q is empty now
+
+    # create txt output
+    sep = '-' * 80 + '\n'
+    txtout = "%s/%s.txt" % (outdir, out_filename)
+    fwtxt = open(txtout, 'w')
+    # print(analysis_dict)
+    for k in analysis_dict.keys():
+        fwtxt.write(sep)
+        fname = 'File: %s\n' % (analysis_dict[k]['filename'])
+        ddata = analysis_dict[k]['data']
+        fwtxt.write(fname)
+        for kdata in ddata.keys():
+            metatxt = '%s:%s\n' % (kdata, ddata[kdata])
+            fwtxt.write(metatxt)
+        fwtxt.write(sep)
+    fwtxt.close()
+
+    # create json output
+    jsonout = "%s/%s.json" % (outdir, out_filename)
+    fwjson = open(jsonout, 'w')
+
+    # print(analysis_dict)
+    jdata = json.dumps(analysis_dict)
+    fwjson.write(jdata)
+    fwjson.close()
+
+    # create html from json
+    htmlout = "%s/%s.html" % (outdir, out_filename)
+    fwhtml = open(htmlout,'w')
+    #print(jdata)
+    html = json2html.convert(json = jdata)
+    fwhtml.write(html)
+    fwhtml.close()
+
+    # create url savefile
+    # print('url_d: ', url_d)
+    jsonurlout = "%s/%s_url.json" % (outdir, out_filename)
+    fwjson = open(jsonurlout, 'w')
+    jdata = json.dumps(url_d)
+    fwjson.write(jdata)
+    fwjson.close()
+
+    txtout = "%s/%s_url.txt" % (outdir, out_filename)
+    fwtxt = open(txtout, 'w')
+    for k in url_d.keys():
+        ddata = url_d[k]
+        metatxt = '%s:%s\n' % (ddata['url'], ddata['filename'])
+        fwtxt.write(metatxt)
+    fwtxt.close()
+
+    return 42
+
+    # This is the end my friend.
 
 def main():
-    parser_desc = "%s %s %s in %s" % (name,version,author,date)
-    parser = argparse.ArgumentParser(prog = name, description=parser_desc)
-    parser.add_argument('-O','--outdir',action='store',dest='outdir',required=False,help="define the outdirectory for downloaded files and analysis output",default='pdfgrab')
-    parser.add_argument('-o','--outfile',action='store',dest='outfile',required=False,help="define file with analysis output, if no parameter given it is outdir/pdfgrab_analysis, please note outfile is *always* written to output directory so do not add the dir as extra path")
-    parser.add_argument('-u','--url',action='store',dest='url_single',required=False,help="grab pdf from specified url for analysis",default=None)
-    #parser.add_argument('-U','--url-list',action='store',dest='urls_many',required=False,help="specify txt file with list of pdf urls to grab",default=None)
-    #########
-    parser.add_argument('-f','--file',action='store',dest='file_single',required=False,help="specify local path of pdf for analysis",default=None)
-    parser.add_argument('-F','--files-dir',action='store',dest='files_dir',required=False,help="specify local path of *directory* with pdf *files* for analysis",default=None)
-    parser.add_argument('-s','--search',action='store',dest='search',required=False,help="specify domain or tld to scrape for pdf-files",default=None)
-    parser.add_argument('-sn','--search-number',action='store',dest='search_stop',required=False,help="specify how many files are searched",default=10,type=int)
-    parser.add_argument('-z','--disable-cert-check',action='store_false',dest='cert_check',required=False,help="if the target domain(s) run with old or bad certificates",default=True)
-
-    args = parser.parse_args()
-    run(args)
+    parser_desc = "%s %s %s in %s" % (name, version, author, date)
+    parser = argparse.ArgumentParser(prog=name, description=parser_desc)
+    parser.add_argument('-O', '--outdir', action='store', dest='outdir', required=False,
+                        help="define the outdirectory for downloaded files and analysis output", default='pdfgrab')
+    parser.add_argument('-o', '--outfile', action='store', dest='outfile', required=False,
+                        help="define file with analysis output, if no parameter given it is outdir/pdfgrab_analysis, please note outfile is *always* written to output directory so do not add the dir as extra path")
+    parser.add_argument('-u', '--url', action='store', dest='url_single', required=False,
+                        help="grab pdf from specified url for analysis", default=None)
+    # parser.add_argument('-U','--url-list',action='store',dest='urls_many',required=False,help="specify txt file with list of pdf urls to grab",default=None)
+    #########
+    parser.add_argument('-f', '--file', action='store', dest='file_single', required=False,
+                        help="specify local path of pdf for analysis", default=None)
+    parser.add_argument('-F', '--files-dir', action='store', dest='files_dir', required=False,
+                        help="specify local path of *directory* with pdf *files* for analysis", default=None)
+    parser.add_argument('-s', '--search', action='store', dest='search', required=False,
+                        help="specify domain or tld to scrape for pdf-files", default=None)
+    parser.add_argument('-sn', '--search-number', action='store', dest='search_stop', required=False,
+                        help="specify how many files are searched", default=10, type=int)
+    parser.add_argument('-z', '--disable-cert-check', action='store_false', dest='cert_check', required=False,
+                        help="if the target domain(s) run with old or bad certificates", default=True)
+
+    if len(sys.argv)<2:
+        parser.print_help(sys.stderr)
+        sys.exit()
+
+    args = parser.parse_args()
+    run(args)
 
 if __name__ == "__main__":
     main()
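
Note: a minimal sketch of the new HTML reporting step, not part of the commit; it mirrors the json2html call added to run() above. The sample dictionary and output filename are made up, and the json2html package must be installed.

# hypothetical stand-alone version of the html report generation
import json
from json2html import *

analysis_dict = {'pdfgrab/report.pdf': {'filename': 'report.pdf',
                                        'data': {'/Author': 'dash', '/Producer': 'LaTeX'}}}

jdata = json.dumps(analysis_dict)
html = json2html.convert(json = jdata)   # same call as in run()

with open('pdfgrab_analysis.html', 'w') as fwhtml:
    fwhtml.write(html)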