first public commit

dash
2019-09-26 16:41:37 +02:00
parent 6a31b372e0
commit d56ca14cb0
2 changed files with 305 additions and 0 deletions

Readme.md (new file, 117 lines)

@@ -0,0 +1,117 @@
# pdfgrab
## What is it?
This is a reborn tool, first used back in the epoch when dinosaurs were traipsing the earth.
Basically it analyses PDF files for metadata. You can point it at a single file or at a directory containing PDFs.
You can also give it the URL of a PDF, or use the integrated googlesearch class (thanks to Mario Vilas)
to search a target site for PDFs, then download and analyse them.
## What information can be gathered?
This depends on the software used to create the PDF, and on whether the metadata has been cleaned afterwards.
However, the following entries are common:
* Producer
* Creator
* CreationDate
* ModificationDate
* Author
* Title
* Subject
and some more :)
## How does it work?
Every file format more complex than plain .txt uses metadata, whether for convenience, for customer support,
or simply to advertise which software has been used. There is plenty of information online about metadata in
different kinds of files such as pictures, documents, videos and music. This tool focuses on PDFs only.
If you are new to the term, have a look here:
https://en.wikipedia.org/wiki/Metadata
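Under the hood, pdfgrab simply reads the PDF's document information dictionary via PyPDF2. A minimal sketch
of that core step, using the same PyPDF2 import the script itself uses (the file name is just a placeholder):
```
from PyPDF2 import pdf

# print the document information dictionary of a single pdf;
# documentInfo can be None if the file carries no /Info entry
with open('example.pdf', 'rb') as fh:
    info = pdf.PdfFileReader(fh).documentInfo
    if info:
        for k in info.keys():
            print(k, info[k])
```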
## Usage
These are your major options:
* grab a pdf from a url and analyse it
* search a site for pdfs via google, then grab and analyse them
* analyse a local pdf
* analyse a local directory with pdfs in it
### Single URL Mode
```
# ./pdfgrab.py -u https://www.kernel.org/doc/mirror/ols2004v2.pdf
```
Result:
```
[+] Grabbing https://www.kernel.org/doc/mirror/ols2004v2.pdf
[+] Written 3893173 bytes for File: pdfgrab/ols2004v2.pdf
[+] Opening pdfgrab/ols2004v2.pdf
--------------------------------------------------------------------------------
File: pdfgrab/ols2004v2.pdf
/Producer pdfTeX-0.14h
/Creator TeX
/CreationDate D:20040714015300
--------------------------------------------------------------------------------
```
### Single File Mode
```
# ./pdfgrab.py -f pdfgrab/ols2004v2.pdf
```
Result:
```
[+] Parsing pdfgrab/ols2004v2.pdf
[+] Opening pdfgrab/ols2004v2.pdf
--------------------------------------------------------------------------------
File: pdfgrab/ols2004v2.pdf
/Producer pdfTeX-0.14h
/Creator TeX
/CreationDate D:20040714015300
--------------------------------------------------------------------------------
```
### Google Search Mode
```
# ./pdfgrab.py -s site:kernel.org
```
Result:
```
[+] Seek and analysing site:kernel.org
http://vger.kernel.org/lpc_bpf2018_talks/bpf_global_data_and_static_keys.pdf
http://vger.kernel.org/netconf2018_files/JiriPirko_netconf2018.pdf
http://vger.kernel.org/netconf2018_files/PaoloAbeni_netconf2018.pdf
http://vger.kernel.org/lpc_net2018_talks/LPC_XDP_Shirokov_paper_v1.pdf
http://vger.kernel.org/netconf2018_files/FlorianFainelli_netconf2018.pdf
http://vger.kernel.org/lpc_net2018_talks/tc_sw_paper.pdf
https://www.kernel.org/doc/mirror/ols2009.pdf
https://www.kernel.org/doc/mirror/ols2004v2.pdf
http://vger.kernel.org/lpc_net2018_talks/ktls_bpf.pdf
http://vger.kernel.org/lpc_net2018_talks/ktls_bpf_paper.pdf
[+] Written 211391 bytes for File: pdfgrab/bpf_global_data_and_static_keys.pdf
[+] Opening pdfgrab/bpf_global_data_and_static_keys.pdf
--------------------------------------------------------------------------------
File: pdfgrab/bpf_global_data_and_static_keys.pdf
/Author
/Title
/Subject
/Creator LaTeX with Beamer class version 3.36
/Producer pdfTeX-1.40.17
/Keywords
/CreationDate D:20181102231821+01'00'
/ModDate D:20181102231821+01'00'
/Trapped /False
/PTEX.Fullbanner This is pdfTeX, Version 3.14159265-2.6-1.40.17 (TeX Live 2016) kpathsea version 6.2.2
```
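The number of search results fetched from Google can be changed with `-sn` (default: 10).
### Directory Mode
Once you have a directory full of PDFs (for example the default `pdfgrab` output directory), you can analyse
all of them in one pass. The directory name below is just an example; the per-file output follows the same
pattern as the modes above:
```
# ./pdfgrab.py -F pdfgrab
```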
## Google
Search: filetype:pdf site:com
Results: 264,000,000
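pdfgrab builds the same kind of query internally: whatever you pass via `-s` gets `filetype:pdf` appended before the search is run.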
## Disclaimer
Have fun!

pdfgrab.py (new executable file, 188 lines)

@@ -0,0 +1,188 @@
#!/usr/bin/env python3
#####################
# yay - old tool adjusted for python3, using googlesearch now
# and not some self crafted f00
#
# new features, new layout, new new :>
# dash in end of September 2019
#
#
# TODO
# * json output
# * txt output
import os
import sys
import argparse
import requests
from PyPDF2 import pdf
import googlesearch as gs
_name_ = 'pdfgrab'
_version_ = '0.3'
_author_ = 'dash'
_date_ = '2019'
def url_strip(url):
    ''' strip trailing newline/carriage-return characters from a url '''
    return url.rstrip("\r\n")
def find_name(url):
    ''' simply parses the url and takes the last path component as the storage name.
        i would not be surprised if this naive approach leads to fuckups
    '''
    # everything after the last slash becomes the local file name
    name = url.split("/")[-1]
    return name
def make_directory(outdir):
    ''' naive mkdir function '''
    try:
        os.mkdir(outdir)
    except FileExistsError:
        pass
    except OSError as e:
        print("[W] mkdir failed: %s" % e)
def download_pdf(url, header_data):
    ''' download the pdf file for later analysis '''
    req = requests.get(url, headers=header_data)
    return req.content
def store_pdf(url, data, outdir):
    ''' store the downloaded pdf data, return the save path '''
    name = find_name(url)
    save = "%s/%s" % (outdir, name)
    with open(save, "wb") as f:
        ret = f.write(data)
    print('[+] Written %d bytes for File: %s' % (ret, save))
    return save
def _parse_pdf(filename):
    ''' the real parsing function: print the document information dictionary '''
    print('[+] Opening %s' % filename)
    with open(filename, 'rb') as pdfile:
        try:
            h = pdf.PdfFileReader(pdfile)
        except pdf.utils.PdfReadError as e:
            print('[-] Error: %s' % (e))
            return
        extract = h.documentInfo
        if extract is None:
            print('[-] No metadata found in %s' % filename)
            return
        print('-'*80)
        print('File: %s' % filename)
        for k in extract.keys():
            print('%s %s' % (k, extract[k]))
        print()
        print('-'*80)
def parse_single_pdf(filename):
    ''' single parse function, currently just a thin wrapper '''
    return _parse_pdf(filename)
def grab_url(url, outdir):
''' function keeping all the steps for the user call of grabbing
just one pdf and analysing it
'''
data = download_pdf(url,None)
savepath = store_pdf(url, data, outdir)
_parse_pdf(savepath)
return
def seek_and_analyse(search,sargs,outdir):
''' function for keeping all the steps of searching for pdfs and analysing
them together
'''
urls = search_pdf(search,sargs)
for url in urls:
grab_url(url,outdir)
def search_pdf(search, sargs):
    ''' the function where googlesearch from mario vilas is called '''
    query = '%s filetype:pdf' % search
    urls = []
    # sargs limits how many search results are fetched
    for url in gs.search(query, stop=sargs):
        print(url)
        urls.append(url)
    return urls
def run(args):
# specify output directory
outdir = args.outdir
# create output directory
make_directory(outdir)
# lets see what the object is
if args.url_single:
url = args.url_single
print('[+] Grabbing %s' % (url))
grab_url(url, outdir)
elif args.file_single:
pdffile = args.file_single
print('[+] Parsing %s' % (pdffile))
_parse_pdf(pdffile)
    elif args.search:
        search = args.search
        print('[+] Seek and de...erm...analysing %s' % (search))
        # respect the -sn/--search-number option instead of a hardcoded value
        sargs = args.search_number
        seek_and_analyse(search, sargs, outdir)
    elif args.files_dir:
        directory = args.files_dir
        print('[+] Analyse pdfs in directory %s' % (directory))
        for f in os.listdir(directory):
            # only look at files that at least claim to be pdfs
            if not f.lower().endswith('.pdf'):
                continue
            fpath = '%s/%s' % (directory, f)
            _parse_pdf(fpath)
else:
print('[-] Dunno what to do, bro.')
#logfile = "%s/%s.txt" % (out,out)
#flog = open(logfile,"w")
def main():
parser_desc = "%s %s %s" % (_name_,_version_,_author_)
    parser = argparse.ArgumentParser(prog=_name_, description=parser_desc)
parser.add_argument('-o','--outdir',action='store',dest='outdir',required=False,help="define the outdirectory for downloaded files and analysis output",default='pdfgrab')
parser.add_argument('-u','--url',action='store',dest='url_single',required=False,help="grab pdf from specified url for analysis",default=None)
parser.add_argument('-f','--file',action='store',dest='file_single',required=False,help="specify local path of pdf for analysis",default=None)
parser.add_argument('-F','--files-dir',action='store',dest='files_dir',required=False,help="specify local path of *directory* with pdf *files* for analysis",default=None)
parser.add_argument('-s','--search',action='store',dest='search',required=False,help="specify domain or tld to scrape for pdf-files",default=None)
parser.add_argument('-sn','--search-number',action='store',dest='search_number',required=False,help="specify how many files are searched",default=10,type=int)
args = parser.parse_args()
run(args)
if __name__ == "__main__":
main()