cert-check option, random user agent, catching "too many requests" errors

dash
2019-09-26 18:45:02 +02:00
parent fb2dfb1527
commit 64f48eef9a
2 changed files with 76 additions and 57 deletions


@@ -123,14 +123,14 @@ File: pdfgrab/bpf_global_data_and_static_keys.pdf
 ```
 ## TODO
-* json output
-* txt output
+* json file-output
+* txt file-output
 * catch conn refused connections
-* set option for certificate verification, default is false
+* ~~set option for certificate verification, default is true~~
 * complete analyse.txt and seperated
 * clean up code
 * do more testing
-* add random useragent for google and website pdf gathering
+* ~~add random useragent for google and website pdf gathering~~
 * ~~add decryption routine~~
 * ~~catch ssl exceptions~~


@@ -4,45 +4,55 @@
 # and not some self crafted f00
 #
 # new features, new layout, new new :>
-# dash in end of September 2019
+# by dash at the end of September 2019
+#
 #
 # TODO
-# * json output
-# * txt output
-# * catch conn refused connections
-# * set option for certificate verification, default is false
+# * json file output
+# * txt file output
 # * complete analyse.txt and seperated
 # * clean up code
 # * do more testing
-# * add random useragent for google and website pdf gathering
+# * fine tune google search
+# * add random timeout for new requests
+# -> maybe not necessary, gs has it ...
+# -> sort of necessary, on the other hand use proxychains man
+# * uh oh some fancy c0l0rs
+# * catch filename to long thingy
 #
 # Done
 # * add decryption routine
 # * catch ssl exceptions
+# * add random useragent for google and website pdf gathering
+# * set option for certificate verification, default is true
+# * catch conn refused connections
 import os
 import sys
 import argparse
 import requests
+import urllib
 from IPython import embed
 from PyPDF2 import pdf
 import googlesearch as gs
-_name_ = 'pdfgrab'
-_version_ = '0.3'
-_author_ = 'dash'
-_date_ = '2019'
+name = 'pdfgrab'
+version = '0.4'
+author = 'dash'
+date = '2019'
 def url_strip(url):
     url = url.rstrip("\n")
     url = url.rstrip("\r")
     return url
+def get_random_agent():
+    return (gs.get_random_user_agent())
 def get_DocInfo(filename, filehandle):
+    ''' the easy way to extract metadata
+    '''
     fh = filehandle
     try:
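The random user-agent part of the commit is just the small `get_random_agent()` wrapper added above, plus the `header_data` dict built in `grab_url()` further down. A minimal, self-contained sketch of the same idea, assuming the same `googlesearch` module the script imports; the URL is purely illustrative:

```python
import requests
import googlesearch as gs

def get_random_agent():
    # pick a browser User-Agent string from googlesearch's built-in list
    return gs.get_random_user_agent()

# illustrative URL only; pdfgrab feeds in the URLs returned by the search
header_data = {'User-Agent': get_random_agent()}
resp = requests.get('https://example.com/paper.pdf', headers=header_data)
print(resp.status_code, resp.headers.get('Content-Type'))
```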
@@ -117,10 +127,14 @@ def make_directory(outdir):
#print("[W] mkdir, some error, directory probably exists") #print("[W] mkdir, some error, directory probably exists")
pass pass
def download_pdf(url, header_data): def download_pdf(url, args, header_data):
''' downloading the pdfile for later analysis ''' ''' downloading the pdfile for later analysis '''
# check the remote tls certificate or not?
cert_check = args.cert_check
try: try:
req = requests.get(url,headers=header_data,verify=True) req = requests.get(url,headers=header_data,verify=cert_check)
#req = requests.get(url,headers=header_data,verify=False) #req = requests.get(url,headers=header_data,verify=False)
data = req.content data = req.content
except requests.exceptions.SSLError as e: except requests.exceptions.SSLError as e:
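The certificate check now hinges on a single boolean handed down from the CLI: `args.cert_check` defaults to `True` and is flipped by the new `-z` flag, and it goes straight into `requests.get(..., verify=...)`. A standalone sketch of that flow (function name and URLs are illustrative; badssl.com is just a well-known bad-certificate test host):

```python
import requests

def fetch_pdf(url, cert_check=True):
    # cert_check mirrors args.cert_check: True verifies the TLS certificate,
    # False (set via -z/--disable-cert-check) skips verification
    try:
        req = requests.get(url, verify=cert_check)
        return req.content
    except requests.exceptions.SSLError as e:
        # with verification on, an old/self-signed certificate lands here
        print('[-] SSL error: %s' % e)
        return -1

data = fetch_pdf('https://self-signed.badssl.com/', cert_check=True)
print('got error marker' if data == -1 else 'got %d bytes' % len(data))
```

The `-1` sentinel matches the convention `grab_url()` already checks with `if data != -1`.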
@@ -134,8 +148,13 @@ def download_pdf(url, header_data):
     return data
 def store_pdf(url,data,outdir):
-    ''' storing the downloaded pdf data '''
+    ''' storing the downloaded pdf data
+    '''
     name = find_name(url)
+    # only allow stored file a name with 50 chars
+    name = name[:49] + '.pdf'
+    print(len(name))
     save = "%s/%s" % (outdir,name)
     try:
         f = open(save,"wb")
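The new truncation in `store_pdf()` addresses the "catch filename to long thingy" TODO item: the URL-derived base name is capped at 49 characters before `.pdf` is appended, so the stored name never exceeds 53 characters, well under common filesystem limits. A tiny illustration:

```python
# pretend find_name() extracted an absurdly long name from the URL
name = 'proceedings-of-the-' + 'very-' * 40 + 'long-workshop'
name = name[:49] + '.pdf'   # same capping as in store_pdf()
print(name, len(name))      # the result is at most 53 characters
```

Since `.pdf` is appended unconditionally, a name that already carries the extension ends up with it twice; harmless, but worth knowing.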
@@ -153,56 +172,54 @@ def store_pdf(url,data,outdir):
 def _parse_pdf(filename):
     ''' the real parsing function '''
-    check_encryption(filename)
-    return True
-    print('[+] Opening %s' % filename)
-    pdfile = open(filename,'rb')
-    try:
-        h = pdf.PdfFileReader(pdfile)
-    except pdf.utils.PdfReadError as e:
-        print('[-] Error: %s' % (e))
-        return
-    return pdfile
-def parse_single_pdf(filename):
-    ''' single parse function '''
-    return 123
-def grab_url(url, outdir):
+    ret = check_encryption(filename)
+    return ret
+def grab_url(url, args, outdir):
     ''' function keeping all the steps for the user call of grabbing
     just one pdf and analysing it
     '''
-    data = download_pdf(url,None)
+    header_data={'User-Agent':get_random_agent()}
+    data = download_pdf(url,args, header_data)
     if data != -1:
         savepath = store_pdf(url, data, outdir)
         _parse_pdf(savepath)
     return
-def seek_and_analyse(search,sargs,outdir):
+def seek_and_analyse(search,args,outdir):
     ''' function for keeping all the steps of searching for pdfs and analysing
     them together
     '''
-    urls = search_pdf(search,sargs)
-    for url in urls:
-        grab_url(url,outdir)
-def search_pdf(search, sargs):
+    # use the search function of googlesearch to get the results
+    urls = search_pdf(search,args)
+    # *if* we get an answer
+    if urls != -1:
+        # process through the list and get the pdfs
+        for url in urls:
+            grab_url(url,args,outdir)
+def search_pdf(search, args):
     ''' the function where googlesearch from mario vilas
     is called
     '''
+    search_stop = args.search_stop
     query='%s filetype:pdf' % search
     #print(query)
     urls = []
-    for url in gs.search(query,num=20,stop=sargs):
-        print(url)
-        urls.append(url)
+    try:
+        for url in gs.search(query,num=20,stop=search_stop,user_agent=gs.get_random_user_agent()):
+            print(url)
+            urls.append(url)
+    except urllib.error.HTTPError as e:
+        print('Error: %s' % e)
+        return -1
     return urls
 def run(args):
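This is the "too many requests" part of the commit: `googlesearch` fetches the result pages with `urllib`, so when Google throttles the scripted queries the loop dies with an `urllib.error.HTTPError` (typically 429 Too Many Requests), which `search_pdf()` now turns into the `-1` marker that `seek_and_analyse()` checks. A standalone sketch of the same guard, assuming the same `googlesearch` module; the query is illustrative, and the sketch imports `urllib.error` explicitly rather than relying on a bare `import urllib`:

```python
import urllib.error
import googlesearch as gs

def search_pdf(search, search_stop=10):
    query = '%s filetype:pdf' % search
    urls = []
    try:
        for url in gs.search(query, num=20, stop=search_stop,
                             user_agent=gs.get_random_user_agent()):
            urls.append(url)
    except urllib.error.HTTPError as e:
        # e.g. "HTTP Error 429: Too Many Requests" when Google rate-limits us
        print('Error: %s' % e)
        return -1
    return urls

print(search_pdf('example.org', search_stop=5))
```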
@@ -217,7 +234,7 @@ def run(args):
     if args.url_single:
         url = args.url_single
         print('[+] Grabbing %s' % (url))
-        grab_url(url, outdir)
+        grab_url(url, args,outdir)
     elif args.file_single:
         pdffile = args.file_single
@@ -226,10 +243,9 @@ def run(args):
     elif args.search:
         search = args.search
-        sargs = args.search_stop
         #print(args)
         print('[+] Seek and de...erm...analysing %s' % (search))
-        seek_and_analyse(search,sargs,outdir)
+        seek_and_analyse(search,args,outdir)
     elif args.files_dir:
         directory = args.files_dir
@@ -239,23 +255,26 @@ def run(args):
             fpath = '%s/%s' % (directory,f)
             _parse_pdf(fpath)
     else:
         print('[-] Dunno what to do, bro.')
+    #logfile = "%s/%s.txt" % (out,out)
+    #flog = open(logfile,"w")
+    return 42
+    # This is the end my friend.
 def main():
-    parser_desc = "%s %s %s" % (_name_,_version_,_author_)
-    parser = argparse.ArgumentParser(prog = __name__, description=parser_desc)
-    parser.add_argument('-o','--outdir',action='store',dest='outdir',required=False,help="define the outdirectory for downloaded files and analysis output",default='pdfgrab')
+    parser_desc = "%s %s %s in %s" % (name,version,author,date)
+    parser = argparse.ArgumentParser(prog = name, description=parser_desc)
+    parser.add_argument('-O','--outdir',action='store',dest='outdir',required=False,help="define the outdirectory for downloaded files and analysis output",default='pdfgrab')
+    # parser.add_argument('-o','--outfile',action='store',dest='outfile',required=False,help="define file with analysis output in txt format",default='pdfgrab_analysis.txt')
     parser.add_argument('-u','--url',action='store',dest='url_single',required=False,help="grab pdf from specified url for analysis",default=None)
+    #parser.add_argument('-U','--url-list',action='store',dest='urls_many',required=False,help="specify txt file with list of pdf urls to grab",default=None)
+    #########
     parser.add_argument('-f','--file',action='store',dest='file_single',required=False,help="specify local path of pdf for analysis",default=None)
     parser.add_argument('-F','--files-dir',action='store',dest='files_dir',required=False,help="specify local path of *directory* with pdf *files* for analysis",default=None)
     parser.add_argument('-s','--search',action='store',dest='search',required=False,help="specify domain or tld to scrape for pdf-files",default=None)
     parser.add_argument('-sn','--search-number',action='store',dest='search_stop',required=False,help="specify how many files are searched",default=10,type=int)
+    parser.add_argument('-z','--disable-cert-check',action='store_false',dest='cert_check',required=False,help="if the target domain(s) run with old or bad certificates",default=True)
     args = parser.parse_args()
     run(args)
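The new `-z/--disable-cert-check` flag uses `store_false`: `cert_check` defaults to `True` and passing `-z` flips it to `False`, which is exactly the value `download_pdf()` feeds into `verify=`. A minimal illustration of that behaviour:

```python
import argparse

parser = argparse.ArgumentParser(prog='pdfgrab')
parser.add_argument('-z', '--disable-cert-check', action='store_false',
                    dest='cert_check', default=True,
                    help="disable TLS certificate verification")

print(parser.parse_args([]).cert_check)      # True  -> certificates are verified
print(parser.parse_args(['-z']).cert_check)  # False -> requests.get(..., verify=False)
```

So a scrape against a host with a broken certificate chain would look something like `python pdfgrab.py -s example.org -sn 20 -z` (script name and domain only illustrative); without `-z` the same run stops in the SSLError handler shown above.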