first public commit

dash
2019-09-26 16:41:37 +02:00
parent 6a31b372e0
commit d56ca14cb0
2 changed files with 305 additions and 0 deletions

Readme.md (new file, 117 lines)

@@ -0,0 +1,117 @@
# pdfgrab
## What is it?
This is a reborn tool, first used back in the epoch when dinosaurs were traipsing the earth.
Basically it analyses PDF files for metadata. You can point it at a single file or at a directory containing PDFs.
You can also give it the URL of a PDF, or use the integrated googlesearch class (thanks to Mario Vilas)
to search a target site for PDFs, then download and analyse them.
## What information can be gathered?
This depends on the software used to create the PDF, and on whether the metadata has been cleaned afterwards.
However, the following entries are common:
* Producer
* Creator
* CreationDate
* ModificationDate
* Author
* Title
* Subject
and some more :)
## How does it work?
Every file format more complex than plain .txt uses metadata, whether for convenience, for customer support,
or simply to advertise which software has been used. There is plenty of information online about metadata in
different kinds of files such as pictures, documents, videos and music. This tool focuses on PDFs only.
If you are new to the term, have a look here:
https://en.wikipedia.org/wiki/Metadata
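Under the hood, pdfgrab simply reads the PDF's document information dictionary via PyPDF2. A minimal sketch
of that core step, using the same PyPDF2 import the script itself uses (the file name is just a placeholder):
```
from PyPDF2 import pdf

# print the document information dictionary of a single pdf;
# documentInfo can be None if the file carries no /Info entry
with open('example.pdf', 'rb') as fh:
    info = pdf.PdfFileReader(fh).documentInfo
    if info:
        for k in info.keys():
            print(k, info[k])
```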
## Usage
These are your major options:
* grab a pdf from a url and analyse it
* search a site for pdfs via google, then grab and analyse them
* analyse a local pdf
* analyse a local directory with pdfs in it
### Single URL Mode
```
# ./pdfgrab.py -u https://www.kernel.org/doc/mirror/ols2004v2.pdf
```
Result:
```
[+] Grabbing https://www.kernel.org/doc/mirror/ols2004v2.pdf
[+] Written 3893173 bytes for File: pdfgrab/ols2004v2.pdf
[+] Opening pdfgrab/ols2004v2.pdf
--------------------------------------------------------------------------------
File: pdfgrab/ols2004v2.pdf
/Producer pdfTeX-0.14h
/Creator TeX
/CreationDate D:20040714015300
--------------------------------------------------------------------------------
```
### Single File Mode
```
# ./pdfgrab.py -f pdfgrab/ols2004v2.pdf
```
Result:
```
[+] Parsing pdfgrab/ols2004v2.pdf
[+] Opening pdfgrab/ols2004v2.pdf
--------------------------------------------------------------------------------
File: pdfgrab/ols2004v2.pdf
/Producer pdfTeX-0.14h
/Creator TeX
/CreationDate D:20040714015300
--------------------------------------------------------------------------------
```
### Google Search Mode
```
# ./pdfgrab.py -s site:kernel.org
```
Result:
```
[+] Seek and analysing site:kernel.org
http://vger.kernel.org/lpc_bpf2018_talks/bpf_global_data_and_static_keys.pdf
http://vger.kernel.org/netconf2018_files/JiriPirko_netconf2018.pdf
http://vger.kernel.org/netconf2018_files/PaoloAbeni_netconf2018.pdf
http://vger.kernel.org/lpc_net2018_talks/LPC_XDP_Shirokov_paper_v1.pdf
http://vger.kernel.org/netconf2018_files/FlorianFainelli_netconf2018.pdf
http://vger.kernel.org/lpc_net2018_talks/tc_sw_paper.pdf
https://www.kernel.org/doc/mirror/ols2009.pdf
https://www.kernel.org/doc/mirror/ols2004v2.pdf
http://vger.kernel.org/lpc_net2018_talks/ktls_bpf.pdf
http://vger.kernel.org/lpc_net2018_talks/ktls_bpf_paper.pdf
[+] Written 211391 bytes for File: pdfgrab/bpf_global_data_and_static_keys.pdf
[+] Opening pdfgrab/bpf_global_data_and_static_keys.pdf
--------------------------------------------------------------------------------
File: pdfgrab/bpf_global_data_and_static_keys.pdf
/Author
/Title
/Subject
/Creator LaTeX with Beamer class version 3.36
/Producer pdfTeX-1.40.17
/Keywords
/CreationDate D:20181102231821+01'00'
/ModDate D:20181102231821+01'00'
/Trapped /False
/PTEX.Fullbanner This is pdfTeX, Version 3.14159265-2.6-1.40.17 (TeX Live 2016) kpathsea version 6.2.2
```
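The number of search results fetched from Google can be changed with `-sn` (default: 10).
### Directory Mode
Once you have a directory full of PDFs (for example the default `pdfgrab` output directory), you can analyse
all of them in one pass. The directory name below is just an example; the per-file output follows the same
pattern as the modes above:
```
# ./pdfgrab.py -F pdfgrab
```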
## Google
Search: filetype:pdf site:com
Results: 264,000,000
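pdfgrab builds the same kind of query internally: whatever you pass via `-s` gets `filetype:pdf` appended before the search is run.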
## Disclaimer
Have fun!

pdfgrab.py (new executable file, 188 lines)

@@ -0,0 +1,188 @@
#!/usr/bin/env python3
#####################
# yay - old tool adjusted for python3, using googlesearch now
# and not some self crafted f00
#
# new features, new layout, new new :>
# dash in end of September 2019
#
#
# TODO
# * json output
# * txt output
import os
import sys
import argparse
import requests
from PyPDF2 import pdf
import googlesearch as gs
_name_ = 'pdfgrab'
_version_ = '0.3'
_author_ = 'dash'
_date_ = '2019'
def url_strip(url):
    ''' strip trailing newline/carriage-return characters from a url '''
    return url.rstrip("\r\n")
def find_name(url):
    ''' simply parses the url and takes the last path component as the storage name.
        i would not be surprised if this naive approach leads to fuckups
    '''
    # everything after the last slash becomes the local file name
    name = url.split("/")[-1]
    return name
def make_directory(outdir):
    ''' naive mkdir function '''
    try:
        os.mkdir(outdir)
    except FileExistsError:
        pass
    except OSError as e:
        print("[W] mkdir failed: %s" % e)
def download_pdf(url, header_data):
    ''' download the pdf file for later analysis '''
    req = requests.get(url, headers=header_data)
    return req.content
def store_pdf(url, data, outdir):
    ''' store the downloaded pdf data, return the save path '''
    name = find_name(url)
    save = "%s/%s" % (outdir, name)
    with open(save, "wb") as f:
        ret = f.write(data)
    print('[+] Written %d bytes for File: %s' % (ret, save))
    return save
def _parse_pdf(filename):
    ''' the real parsing function: print the document information dictionary '''
    print('[+] Opening %s' % filename)
    with open(filename, 'rb') as pdfile:
        try:
            h = pdf.PdfFileReader(pdfile)
        except pdf.utils.PdfReadError as e:
            print('[-] Error: %s' % (e))
            return
        extract = h.documentInfo
        if extract is None:
            print('[-] No metadata found in %s' % filename)
            return
        print('-'*80)
        print('File: %s' % filename)
        for k in extract.keys():
            print('%s %s' % (k, extract[k]))
        print()
        print('-'*80)
def parse_single_pdf(filename):
    ''' single parse function, currently just a thin wrapper '''
    return _parse_pdf(filename)
def grab_url(url, outdir):
''' function keeping all the steps for the user call of grabbing
just one pdf and analysing it
'''
data = download_pdf(url,None)
savepath = store_pdf(url, data, outdir)
_parse_pdf(savepath)
return
def seek_and_analyse(search,sargs,outdir):
''' function for keeping all the steps of searching for pdfs and analysing
them together
'''
urls = search_pdf(search,sargs)
for url in urls:
grab_url(url,outdir)
def search_pdf(search, sargs):
    ''' the function where googlesearch from mario vilas is called '''
    query = '%s filetype:pdf' % search
    urls = []
    # sargs limits how many search results are fetched
    for url in gs.search(query, stop=sargs):
        print(url)
        urls.append(url)
    return urls
def run(args):
# specify output directory
outdir = args.outdir
# create output directory
make_directory(outdir)
# lets see what the object is
if args.url_single:
url = args.url_single
print('[+] Grabbing %s' % (url))
grab_url(url, outdir)
elif args.file_single:
pdffile = args.file_single
print('[+] Parsing %s' % (pdffile))
_parse_pdf(pdffile)
    elif args.search:
        search = args.search
        print('[+] Seek and de...erm...analysing %s' % (search))
        # respect the -sn/--search-number option instead of a hardcoded value
        sargs = args.search_number
        seek_and_analyse(search, sargs, outdir)
    elif args.files_dir:
        directory = args.files_dir
        print('[+] Analyse pdfs in directory %s' % (directory))
        for f in os.listdir(directory):
            # only look at files that at least claim to be pdfs
            if not f.lower().endswith('.pdf'):
                continue
            fpath = '%s/%s' % (directory, f)
            _parse_pdf(fpath)
else:
print('[-] Dunno what to do, bro.')
#logfile = "%s/%s.txt" % (out,out)
#flog = open(logfile,"w")
def main():
parser_desc = "%s %s %s" % (_name_,_version_,_author_)
    parser = argparse.ArgumentParser(prog=_name_, description=parser_desc)
parser.add_argument('-o','--outdir',action='store',dest='outdir',required=False,help="define the outdirectory for downloaded files and analysis output",default='pdfgrab')
parser.add_argument('-u','--url',action='store',dest='url_single',required=False,help="grab pdf from specified url for analysis",default=None)
parser.add_argument('-f','--file',action='store',dest='file_single',required=False,help="specify local path of pdf for analysis",default=None)
parser.add_argument('-F','--files-dir',action='store',dest='files_dir',required=False,help="specify local path of *directory* with pdf *files* for analysis",default=None)
parser.add_argument('-s','--search',action='store',dest='search',required=False,help="specify domain or tld to scrape for pdf-files",default=None)
parser.add_argument('-sn','--search-number',action='store',dest='search_number',required=False,help="specify how many files are searched",default=10,type=int)
args = parser.parse_args()
run(args)
if __name__ == "__main__":
main()