cert-check, random-user-agent, catching too many requests
@@ -123,14 +123,14 @@ File: pdfgrab/bpf_global_data_and_static_keys.pdf
 ```
 
 ## TODO
-* json output
-* txt output
+* json file-output
+* txt file-output
 * catch conn refused connections
-* set option for certificate verification, default is false
+* ~~set option for certificate verification, default is true~~
 * complete analyse.txt and seperated
 * clean up code
 * do more testing
-* add random useragent for google and website pdf gathering
+* ~~add random useragent for google and website pdf gathering~~
 * ~~add decryption routine~~
 * ~~catch ssl exceptions~~
 
pdfgrab.py (125 changed lines)
@@ -4,45 +4,55 @@
 # and not some self crafted f00
 #
 # new features, new layout, new new :>
-# dash in end of September 2019
-#
+# by dash at the end of September 2019
 #
 # TODO
-# * json output
-# * txt output
-# * catch conn refused connections
-# * set option for certificate verification, default is false
+# * json file output
+# * txt file output
 # * complete analyse.txt and seperated
 # * clean up code
 # * do more testing
-# * add random useragent for google and website pdf gathering
+# * fine tune google search
+# * add random timeout for new requests
+# -> maybe not necessary, gs has it ...
+# -> sort of necessary, on the other hand use proxychains man
+# * uh oh some fancy c0l0rs
+# * catch filename to long thingy
 #
 # Done
 # * add decryption routine
 # * catch ssl exceptions
+# * add random useragent for google and website pdf gathering
+# * set option for certificate verification, default is true
+# * catch conn refused connections
 
 import os
 import sys
 import argparse
 import requests
+import urllib
 
 from IPython import embed
 
 from PyPDF2 import pdf
 import googlesearch as gs
 
-_name_ = 'pdfgrab'
-_version_ = '0.3'
-_author_ = 'dash'
-_date_ = '2019'
+name = 'pdfgrab'
+version = '0.4'
+author = 'dash'
+date = '2019'
 
 def url_strip(url):
     url = url.rstrip("\n")
     url = url.rstrip("\r")
     return url
 
+def get_random_agent():
+    return (gs.get_random_user_agent())
 
 def get_DocInfo(filename, filehandle):
+    ''' the easy way to extract metadata
+    '''
 
     fh = filehandle
     try:
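The hunk above introduces `get_random_agent()`, a thin wrapper around `googlesearch.get_random_user_agent()`. Below is a minimal sketch of how such a randomized `User-Agent` header is typically fed into a `requests` download; the fallback string and the example URL are illustrative, not part of pdfgrab.

```
import requests

try:
    import googlesearch as gs
    user_agent = gs.get_random_user_agent()   # same helper pdfgrab wraps
except ImportError:
    # placeholder fallback, only for this sketch
    user_agent = 'Mozilla/5.0 (compatible; pdfgrab-sketch)'

header_data = {'User-Agent': user_agent}
# example URL is illustrative; verify=True keeps certificate checking on
resp = requests.get('https://example.org/some.pdf',
                    headers=header_data, verify=True, timeout=30)
print(resp.status_code, len(resp.content))
```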
@@ -117,10 +127,14 @@ def make_directory(outdir):
         #print("[W] mkdir, some error, directory probably exists")
         pass
 
-def download_pdf(url, header_data):
+def download_pdf(url, args, header_data):
     ''' downloading the pdfile for later analysis '''
+
+    # check the remote tls certificate or not?
+    cert_check = args.cert_check
+
     try:
-        req = requests.get(url,headers=header_data,verify=True)
+        req = requests.get(url,headers=header_data,verify=cert_check)
         #req = requests.get(url,headers=header_data,verify=False)
         data = req.content
     except requests.exceptions.SSLError as e:
@@ -134,8 +148,13 @@ def download_pdf(url, header_data):
     return data
 
 def store_pdf(url,data,outdir):
-    ''' storing the downloaded pdf data '''
+    ''' storing the downloaded pdf data
+    '''
     name = find_name(url)
+
+    # only allow stored file a name with 50 chars
+    name = name[:49] + '.pdf'
+    print(len(name))
     save = "%s/%s" % (outdir,name)
     try:
         f = open(save,"wb")
@@ -153,56 +172,54 @@ def store_pdf(url,data,outdir):
 def _parse_pdf(filename):
     ''' the real parsing function '''
 
-    check_encryption(filename)
-    return True
+    ret = check_encryption(filename)
+    return ret
 
-    print('[+] Opening %s' % filename)
-    pdfile = open(filename,'rb')
-
-    try:
-        h = pdf.PdfFileReader(pdfile)
-    except pdf.utils.PdfReadError as e:
-        print('[-] Error: %s' % (e))
-        return
-
-    return pdfile
-
-
-def parse_single_pdf(filename):
-    ''' single parse function '''
-    return 123
-
-def grab_url(url, outdir):
+def grab_url(url, args, outdir):
     ''' function keeping all the steps for the user call of grabbing
     just one pdf and analysing it
     '''
-    data = download_pdf(url,None)
+    header_data={'User-Agent':get_random_agent()}
+    data = download_pdf(url,args, header_data)
     if data != -1:
         savepath = store_pdf(url, data, outdir)
         _parse_pdf(savepath)
 
     return
 
-def seek_and_analyse(search,sargs,outdir):
+def seek_and_analyse(search,args,outdir):
     ''' function for keeping all the steps of searching for pdfs and analysing
     them together
     '''
-    urls = search_pdf(search,sargs)
-    for url in urls:
-        grab_url(url,outdir)
+    # use the search function of googlesearch to get the results
+    urls = search_pdf(search,args)
+
+    # *if* we get an answer
+    if urls != -1:
+        # process through the list and get the pdfs
+        for url in urls:
+            grab_url(url,args,outdir)
 
-def search_pdf(search, sargs):
+def search_pdf(search, args):
     ''' the function where googlesearch from mario vilas
     is called
     '''
 
+    search_stop = args.search_stop
+
     query='%s filetype:pdf' % search
     #print(query)
     urls = []
-    for url in gs.search(query,num=20,stop=sargs):
-        print(url)
-        urls.append(url)
+    try:
+        for url in gs.search(query,num=20,stop=search_stop,user_agent=gs.get_random_user_agent()):
+            print(url)
+            urls.append(url)
+
+    except urllib.error.HTTPError as e:
+        print('Error: %s' % e)
+        return -1
 
     return urls
 
 def run(args):
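The `search_pdf()` change above is the "catching too many requests" part of the commit message: `gs.search()` raises `urllib.error.HTTPError` (typically 429 Too Many Requests) when Google throttles the scraper, and the function now returns `-1` so `seek_and_analyse()` can skip the download loop. A reduced, self-contained sketch of that control flow; the function name and the query term are illustrative.

```
import urllib.error
import googlesearch as gs

def search_pdf_sketch(search, stop=10):
    query = '%s filetype:pdf' % search      # same query shape as pdfgrab
    urls = []
    try:
        for url in gs.search(query, num=20, stop=stop,
                             user_agent=gs.get_random_user_agent()):
            urls.append(url)
    except urllib.error.HTTPError as e:
        # Google rate limiting usually surfaces here as "HTTP Error 429"
        print('Error: %s' % e)
        return -1
    return urls

urls = search_pdf_sketch('example.com')     # performs a live Google query
if urls != -1:
    for url in urls:
        print(url)
```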
@@ -217,7 +234,7 @@ def run(args):
     if args.url_single:
         url = args.url_single
         print('[+] Grabbing %s' % (url))
-        grab_url(url, outdir)
+        grab_url(url, args,outdir)
 
     elif args.file_single:
         pdffile = args.file_single
@@ -226,10 +243,9 @@ def run(args):
 
     elif args.search:
         search = args.search
-        sargs = args.search_stop
         #print(args)
         print('[+] Seek and de...erm...analysing %s' % (search))
-        seek_and_analyse(search,sargs,outdir)
+        seek_and_analyse(search,args,outdir)
 
     elif args.files_dir:
         directory = args.files_dir
@@ -239,23 +255,26 @@ def run(args):
             fpath = '%s/%s' % (directory,f)
             _parse_pdf(fpath)
 
 
 
 
     else:
         print('[-] Dunno what to do, bro.')
-        #logfile = "%s/%s.txt" % (out,out)
-        #flog = open(logfile,"w")
+    return 42
+    # This is the end my friend.
 
 def main():
-    parser_desc = "%s %s %s" % (_name_,_version_,_author_)
-    parser = argparse.ArgumentParser(prog = __name__, description=parser_desc)
-    parser.add_argument('-o','--outdir',action='store',dest='outdir',required=False,help="define the outdirectory for downloaded files and analysis output",default='pdfgrab')
+    parser_desc = "%s %s %s in %s" % (name,version,author,date)
+    parser = argparse.ArgumentParser(prog = name, description=parser_desc)
+    parser.add_argument('-O','--outdir',action='store',dest='outdir',required=False,help="define the outdirectory for downloaded files and analysis output",default='pdfgrab')
+    # parser.add_argument('-o','--outfile',action='store',dest='outfile',required=False,help="define file with analysis output in txt format",default='pdfgrab_analysis.txt')
     parser.add_argument('-u','--url',action='store',dest='url_single',required=False,help="grab pdf from specified url for analysis",default=None)
+    #parser.add_argument('-U','--url-list',action='store',dest='urls_many',required=False,help="specify txt file with list of pdf urls to grab",default=None)
+    #########
     parser.add_argument('-f','--file',action='store',dest='file_single',required=False,help="specify local path of pdf for analysis",default=None)
     parser.add_argument('-F','--files-dir',action='store',dest='files_dir',required=False,help="specify local path of *directory* with pdf *files* for analysis",default=None)
     parser.add_argument('-s','--search',action='store',dest='search',required=False,help="specify domain or tld to scrape for pdf-files",default=None)
    parser.add_argument('-sn','--search-number',action='store',dest='search_stop',required=False,help="specify how many files are searched",default=10,type=int)
+    parser.add_argument('-z','--disable-cert-check',action='store_false',dest='cert_check',required=False,help="if the target domain(s) run with old or bad certificates",default=True)
 
     args = parser.parse_args()
     run(args)
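The new `-z/--disable-cert-check` option relies on `action='store_false'`: `args.cert_check` defaults to `True` and only flips to `False` when the flag is passed, which is the value `download_pdf()` hands to `requests.get(..., verify=...)`. A small standalone sketch of that argparse behaviour (the prog name is illustrative).

```
import argparse

parser = argparse.ArgumentParser(prog='pdfgrab-sketch')
parser.add_argument('-z', '--disable-cert-check', action='store_false',
                    dest='cert_check', default=True,
                    help="disable TLS certificate verification")

print(parser.parse_args([]).cert_check)      # True  -> requests verifies the cert
print(parser.parse_args(['-z']).cert_check)  # False -> requests.get(verify=False)
```

With `verify=False`, `requests` still downloads the file but emits an `InsecureRequestWarning`, so the flag is only meant for targets with old or broken certificates, as the help text says.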