From d56ca14cb0998495a468688e8c5512074966a412 Mon Sep 17 00:00:00 2001
From: dash
Date: Thu, 26 Sep 2019 16:41:37 +0200
Subject: [PATCH] first public commit

---
 Readme.md  | 117 +++++++++++++++++++++++++++++++++
 pdfgrab.py | 188 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 305 insertions(+)
 create mode 100644 Readme.md
 create mode 100755 pdfgrab.py

diff --git a/Readme.md b/Readme.md
new file mode 100644
index 0000000..8f4e842
--- /dev/null
+++ b/Readme.md
@@ -0,0 +1,117 @@
+# pdfgrab
+
+## What is it?
+
+This is a reborn tool, first used back in the epoch when dinosaurs were traipsing the earth.
+Basically, it analyses PDF files for metadata. You can point it at a single pdf or at a directory full of pdfs.
+You can also give it the url of a pdf, or use the integrated googlesearch module (thanks to Mario Vilas)
+to search a target site for pdfs, then download and analyse them.
+
+## What information can be gathered?
+
+That depends on the software used to create the pdf, and on whether the file has been cleaned afterwards.
+The following fields are common, though:
+
+* Producer
+* Creator
+* CreationDate
+* ModificationDate
+* Author
+* Title
+* Subject
+
+and some more :)
+
+## How does it work?
+
+Every file format more complex than .txt or the like carries metadata, be it for convenience, for customer support, or simply to advertise the software it was created with.
+There is a lot of information online about metadata in different sorts of files (pictures, documents, videos, music). This tool focuses on pdf only.
+If you are new to the term, have a look here:
+https://en.wikipedia.org/wiki/Metadata
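+
+Under the hood, pdfgrab reads the document information dictionary that PyPDF2 exposes. A minimal sketch of that core step (the file name is just an example):
+
+```python
+from PyPDF2 import pdf
+
+# open the pdf in binary mode and dump its document information dictionary
+with open('ols2004v2.pdf', 'rb') as f:
+    reader = pdf.PdfFileReader(f)
+    info = reader.documentInfo  # may be None if the file has no /Info dictionary
+    for key, value in (info or {}).items():
+        print(key, value)
+```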
+
+## Usage
+
+These are your major options:
+* grab a pdf from a url and analyse it
+* search a site for pdfs via google, then grab and analyse them
+* analyse a local pdf
+* analyse a local directory full of pdfs
+
+### Single Url Mode
+
+```
+# ./pdfgrab.py -u https://www.kernel.org/doc/mirror/ols2004v2.pdf
+```
+Result:
+```
+[+] Grabbing https://www.kernel.org/doc/mirror/ols2004v2.pdf
+[+] Written 3893173 bytes for File: pdfgrab/ols2004v2.pdf
+[+] Opening pdfgrab/ols2004v2.pdf
+--------------------------------------------------------------------------------
+File: pdfgrab/ols2004v2.pdf
+/Producer pdfTeX-0.14h
+/Creator TeX
+/CreationDate D:20040714015300
+--------------------------------------------------------------------------------
+```
+
+### Single File Mode
+
+```
+# ./pdfgrab.py -f pdfgrab/ols2004v2.pdf
+```
+Result:
+```
+[+] Parsing pdfgrab/ols2004v2.pdf
+[+] Opening pdfgrab/ols2004v2.pdf
+--------------------------------------------------------------------------------
+File: pdfgrab/ols2004v2.pdf
+/Producer pdfTeX-0.14h
+/Creator TeX
+/CreationDate D:20040714015300
+--------------------------------------------------------------------------------
+```
+
+### Google Search Mode
+```
+# ./pdfgrab.py -s site:kernel.org
+```
+Result:
+```
+[+] Seek and analysing site:kernel.org
+http://vger.kernel.org/lpc_bpf2018_talks/bpf_global_data_and_static_keys.pdf
+http://vger.kernel.org/netconf2018_files/JiriPirko_netconf2018.pdf
+http://vger.kernel.org/netconf2018_files/PaoloAbeni_netconf2018.pdf
+http://vger.kernel.org/lpc_net2018_talks/LPC_XDP_Shirokov_paper_v1.pdf
+http://vger.kernel.org/netconf2018_files/FlorianFainelli_netconf2018.pdf
+http://vger.kernel.org/lpc_net2018_talks/tc_sw_paper.pdf
+https://www.kernel.org/doc/mirror/ols2009.pdf
+https://www.kernel.org/doc/mirror/ols2004v2.pdf
+http://vger.kernel.org/lpc_net2018_talks/ktls_bpf.pdf
+http://vger.kernel.org/lpc_net2018_talks/ktls_bpf_paper.pdf
+
+[+] Written 211391 bytes for File: pdfgrab/bpf_global_data_and_static_keys.pdf
+[+] Opening pdfgrab/bpf_global_data_and_static_keys.pdf
+--------------------------------------------------------------------------------
+File: pdfgrab/bpf_global_data_and_static_keys.pdf
+/Author
+/Title
+/Subject
+/Creator LaTeX with Beamer class version 3.36
+/Producer pdfTeX-1.40.17
+/Keywords
+/CreationDate D:20181102231821+01'00'
+/ModDate D:20181102231821+01'00'
+/Trapped /False
+/PTEX.Fullbanner This is pdfTeX, Version 3.14159265-2.6-1.40.17 (TeX Live 2016) kpathsea version 6.2.2
+```
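+
+### Directory Mode
+
+The fourth option works the same way; this sketch assumes the pdfs grabbed in the runs above landed in the default output directory `pdfgrab`:
+```
+# ./pdfgrab.py -F pdfgrab
+```
+Each pdf in the directory is parsed in turn and printed in the same format as Single File Mode.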
+
+## Google
+
+For a sense of scale, a single google search gives an idea how many pdfs are out there:
+
+Search: `filetype:pdf site:com`
+Results: 264,000,000
+
+## Disclaimer
+
+Have fun!
diff --git a/pdfgrab.py b/pdfgrab.py
new file mode 100755
index 0000000..517b92d
--- /dev/null
+++ b/pdfgrab.py
@@ -0,0 +1,188 @@
+#!/usr/bin/env python3
+#####################
+# yay - old tool adjusted for python3, using googlesearch now
+# and not some self crafted f00
+#
+# new features, new layout, new new :>
+# dash, end of September 2019
+#
+#
+# TODO
+# * json output
+# * txt output
+
+import os
+import sys
+import argparse
+import requests
+
+from PyPDF2 import pdf
+import googlesearch as gs
+
+_name_ = 'pdfgrab'
+_version_ = '0.3'
+_author_ = 'dash'
+_date_ = '2019'
+
+def url_strip(url):
+    ''' strip trailing newline characters from a url '''
+    url = url.rstrip("\n")
+    url = url.rstrip("\r")
+    return url
+
+def find_name(url):
+    ''' simply parses the urlencoded name and extracts the last path
+        component as storage name. i would not be surprised if this
+        naive approach can lead to fuckups
+    '''
+    # the part after the last slash is used as the file name
+    name = url.split("/")[-1]
+    return name
+
+def make_directory(outdir):
+    ''' naive mkdir function '''
+    try:
+        os.mkdir(outdir)
+    except FileExistsError:
+        pass
+    except OSError as e:
+        print("[W] mkdir failed: %s" % e)
+
+def download_pdf(url, header_data):
+    ''' download the pdf file for later analysis '''
+    req = requests.get(url, headers=header_data)
+    return req.content
+
+def store_pdf(url, data, outdir):
+    ''' store the downloaded pdf data '''
+    name = find_name(url)
+    save = "%s/%s" % (outdir, name)
+    with open(save, "wb") as f:
+        ret = f.write(data)
+    print('[+] Written %d bytes for File: %s' % (ret, save))
+
+    # return the savepath
+    return save
+
+def _parse_pdf(filename):
+    ''' the real parsing function '''
+
+    print('[+] Opening %s' % filename)
+    with open(filename, 'rb') as pdfile:
+        try:
+            h = pdf.PdfFileReader(pdfile)
+        except pdf.utils.PdfReadError as e:
+            print('[-] Error: %s' % (e))
+            return
+
+        extract = h.documentInfo
+
+        print('-'*80)
+        print('File: %s' % filename)
+        for k in extract.keys():
+            edata = '%s %s' % (k, extract[k])
+            print(edata)
+        print('-'*80)
+
+def parse_single_pdf(filename):
+    ''' single parse function '''
+    return _parse_pdf(filename)
+
+def grab_url(url, outdir):
+    ''' function keeping all the steps for the user call of grabbing
+        just one pdf and analysing it
+    '''
+    data = download_pdf(url, None)
+    savepath = store_pdf(url, data, outdir)
+    _parse_pdf(savepath)
+
+def seek_and_analyse(search, sargs, outdir):
+    ''' function keeping all the steps of searching for pdfs and
+        analysing them together
+    '''
+    urls = search_pdf(search, sargs)
+    for url in urls:
+        grab_url(url, outdir)
+
+def search_pdf(search, sargs):
+    ''' the function where googlesearch from mario vilas is called '''
+    query = '%s filetype:pdf' % search
+    urls = []
+    # stop after sargs results (passed down from the -sn option)
+    for url in gs.search(query, stop=sargs):
+        print(url)
+        urls.append(url)
+
+    return urls
+
+def run(args):
+
+    # specify output directory
+    outdir = args.outdir
+
+    # create output directory
+    make_directory(outdir)
+
+    # lets see what the object is
+    if args.url_single:
+        url = args.url_single
+        print('[+] Grabbing %s' % (url))
+        grab_url(url, outdir)
+
+    elif args.file_single:
+        pdffile = args.file_single
+        print('[+] Parsing %s' % (pdffile))
+        _parse_pdf(pdffile)
+
+    elif args.search:
+        search = args.search
+        print('[+] Seek and de...erm...analysing %s' % (search))
+        sargs = args.search_number
+        seek_and_analyse(search, sargs, outdir)
+
+    elif args.files_dir:
+        directory = args.files_dir
+        print('[+] Analyse pdfs in directory %s' % (directory))
+        for f in os.listdir(directory):
+            fpath = '%s/%s' % (directory, f)
+            _parse_pdf(fpath)
+
+    else:
+        print('[-] Dunno what to do, bro.')
+
+def main():
+    parser_desc = "%s %s %s" % (_name_, _version_, _author_)
+    parser = argparse.ArgumentParser(prog=_name_, description=parser_desc)
+    parser.add_argument('-o','--outdir',action='store',dest='outdir',required=False,help="define the output directory for downloaded files and analysis output",default='pdfgrab')
+    parser.add_argument('-u','--url',action='store',dest='url_single',required=False,help="grab pdf from specified url for analysis",default=None)
+    parser.add_argument('-f','--file',action='store',dest='file_single',required=False,help="specify local path of pdf for analysis",default=None)
+    parser.add_argument('-F','--files-dir',action='store',dest='files_dir',required=False,help="specify local path of *directory* with pdf *files* for analysis",default=None)
+    parser.add_argument('-s','--search',action='store',dest='search',required=False,help="specify domain or tld to scrape for pdf-files",default=None)
+    parser.add_argument('-sn','--search-number',action='store',dest='search_number',required=False,help="specify how many files are searched",default=10,type=int)
+
+    args = parser.parse_args()
+    run(args)
+
+if __name__ == "__main__":
+    main()
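+
+# Example invocations (the flags are defined in main() above; urls and
+# paths are illustrative):
+#
+#   ./pdfgrab.py -u https://www.kernel.org/doc/mirror/ols2004v2.pdf
+#   ./pdfgrab.py -f pdfgrab/ols2004v2.pdf
+#   ./pdfgrab.py -s site:kernel.org -sn 20
+#   ./pdfgrab.py -F pdfgrab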