first public commit

Readme.md
# pdfgrab
## What is it?

This is a reborn tool from an epoch when dinosaurs were still traipsing the earth.

Basically, it analyses PDF files for metadata. You can point it at a local file or a directory of PDFs,
give it the URL of a PDF, or use the integrated googlesearch class (thanks to Mario Vilas)
to search a target site for PDFs, download them and analyse them.

## What information can be gathered?

This depends on the software used to create the PDF, and on whether the metadata has been cleaned.
However, the following fields are common (a minimal reading sketch follows the list):

* Producer
* Creator
* CreationDate
* ModificationDate
* Author
* Title
* Subject

and some more :)

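As a minimal sketch (not part of pdfgrab itself), these fields can be read with PyPDF2's
`PdfFileReader`, the same class the script uses; `example.pdf` is a placeholder path:

```
from PyPDF2 import PdfFileReader

with open("example.pdf", "rb") as fh:
    info = PdfFileReader(fh).documentInfo   # DocumentInformation dict, or None
    for key in ("/Producer", "/Creator", "/CreationDate", "/ModDate",
                "/Author", "/Title", "/Subject"):
        # metadata entries are optional, so missing keys simply print as None
        print(key, info.get(key) if info else None)
```
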
## How does it work?

Every file type more complex than plain .txt (or alike) carries metadata, for convenience, for customer
support, or simply to show which software was used. There is a lot of information online about metadata
in different sorts of files such as pictures, documents, videos and music. This tool focuses on PDFs only.
If you are new to the term, have a look here:

https://en.wikipedia.org/wiki/Metadata

## Usage

These are your major options:

* grab a PDF from a URL and analyse it
* search a site for PDFs via Google, then grab and analyse them
* analyse a local PDF
* analyse a local directory containing PDFs (see the command sketch below)

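Directory mode is the only option without a sample run below; assuming PDFs were already downloaded
into the default output directory `pdfgrab`, a run would look roughly like this:

```
# ./pdfgrab.py -F pdfgrab/
```
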
### Single URL Mode

```
# ./pdfgrab.py -u https://www.kernel.org/doc/mirror/ols2004v2.pdf
```

Result:

```
[+] Grabbing https://www.kernel.org/doc/mirror/ols2004v2.pdf
[+] Written 3893173 bytes for File: pdfgrab/ols2004v2.pdf
[+] Opening pdfgrab/ols2004v2.pdf
--------------------------------------------------------------------------------
File: pdfgrab/ols2004v2.pdf
/Producer pdfTeX-0.14h
/Creator TeX
/CreationDate D:20040714015300
--------------------------------------------------------------------------------
```

### Single File Mode

```
# ./pdfgrab.py -f pdfgrab/ols2004v2.pdf
```

Result:

```
[+] Parsing pdfgrab/ols2004v2.pdf
[+] Opening pdfgrab/ols2004v2.pdf
--------------------------------------------------------------------------------
File: pdfgrab/ols2004v2.pdf
/Producer pdfTeX-0.14h
/Creator TeX
/CreationDate D:20040714015300
--------------------------------------------------------------------------------
```

### Google Search Mode

```
# ./pdfgrab.py -s site:kernel.org
```

Result:

```
[+] Seek and analysing site:kernel.org
http://vger.kernel.org/lpc_bpf2018_talks/bpf_global_data_and_static_keys.pdf
http://vger.kernel.org/netconf2018_files/JiriPirko_netconf2018.pdf
http://vger.kernel.org/netconf2018_files/PaoloAbeni_netconf2018.pdf
http://vger.kernel.org/lpc_net2018_talks/LPC_XDP_Shirokov_paper_v1.pdf
http://vger.kernel.org/netconf2018_files/FlorianFainelli_netconf2018.pdf
http://vger.kernel.org/lpc_net2018_talks/tc_sw_paper.pdf
https://www.kernel.org/doc/mirror/ols2009.pdf
https://www.kernel.org/doc/mirror/ols2004v2.pdf
http://vger.kernel.org/lpc_net2018_talks/ktls_bpf.pdf
http://vger.kernel.org/lpc_net2018_talks/ktls_bpf_paper.pdf

[+] Written 211391 bytes for File: pdfgrab/bpf_global_data_and_static_keys.pdf
[+] Opening pdfgrab/bpf_global_data_and_static_keys.pdf
--------------------------------------------------------------------------------
File: pdfgrab/bpf_global_data_and_static_keys.pdf
/Author
/Title
/Subject
/Creator LaTeX with Beamer class version 3.36
/Producer pdfTeX-1.40.17
/Keywords
/CreationDate D:20181102231821+01'00'
/ModDate D:20181102231821+01'00'
/Trapped /False
/PTEX.Fullbanner This is pdfTeX, Version 3.14159265-2.6-1.40.17 (TeX Live 2016) kpathsea version 6.2.2
```

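Under the hood, search mode simply appends `filetype:pdf` to your search term and hands the query to
the googlesearch module by Mario Vilas, as the script's `search_pdf` function does. A minimal sketch,
where the query and result limit are illustrative:

```
import googlesearch as gs

query = "site:kernel.org filetype:pdf"
for url in gs.search(query, stop=10):   # stop limits the number of results
    print(url)
```
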
## Google

Search: filetype:pdf site:com

Results: 264,000,000

## Disclaimer

Have fun!

pdfgrab.py

#!/usr/bin/env python3
#####################
# yay - old tool adjusted for python3, using googlesearch now
# and not some self crafted f00
#
# new features, new layout, new new :>
# dash in end of September 2019
#
#
# TODO
# * json output
# * txt output

import os
import sys
import argparse
import requests

from PyPDF2 import pdf
import googlesearch as gs

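# third-party dependencies: requests, PyPDF2 1.x (which provides the
# PyPDF2.pdf submodule used below) and the googlesearch module by
# Mario Vilas (assumed to be installed from the PyPI package "google")
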
_name_ = 'pdfgrab'
_version_ = '0.3'
_author_ = 'dash'
_date_ = '2019'

def url_strip(url):
    ''' strip trailing newline/carriage-return characters from a url '''
    url = url.rstrip("\n")
    url = url.rstrip("\r")
    return url


def find_name(url):
    ''' simply parses the urlencoded name and extracts the storage name
        i would not be surprised if this naive approach can lead to fuckups
    '''

    # find the name of the file: the last component of the url path
    name = url.split("/")[-1]
    print(name)

    return name

def make_directory(outdir):
    ''' naive mkdir function '''
    try:
        os.mkdir(outdir)
    except OSError as e:
        print("[W] mkdir: %s (directory probably exists)" % e)

def download_pdf(url, header_data):
    ''' download the pdf file for later analysis '''
    req = requests.get(url, headers=header_data)
    data = req.content
    #data = req.text
    print(len(data))
    return data

def store_pdf(url, data, outdir):
    ''' store the downloaded pdf data '''
    name = find_name(url)
    save = "%s/%s" % (outdir, name)
    with open(save, "wb") as f:
        ret = f.write(data)
    print('[+] Written %d bytes for File: %s' % (ret, save))

    # return the savepath
    return save

def _parse_pdf(filename):
    ''' the real parsing function '''

    print('[+] Opening %s' % filename)
    pdfile = open(filename, 'rb')

    try:
        h = pdf.PdfFileReader(pdfile)
    except pdf.utils.PdfReadError as e:
        print('[-] Error: %s' % (e))
        pdfile.close()
        return

    extract = h.documentInfo
    if extract is None:
        print('[-] No document info found in %s' % filename)
        pdfile.close()
        return

    print('-'*80)
    print('File: %s' % filename)
    for k in extract.keys():
        edata = '%s %s' % (k, extract[k])
        print(edata)
    print()
    print('-'*80)
    pdfile.close()

def parse_single_pdf(filename):
    ''' single parse function (currently an unused stub) '''
    return 123

def grab_url(url, outdir):
    ''' function keeping all the steps for the user call of grabbing
        just one pdf and analysing it
    '''
    data = download_pdf(url, None)
    savepath = store_pdf(url, data, outdir)
    _parse_pdf(savepath)

    return

def seek_and_analyse(search, sargs, outdir):
    ''' function keeping all the steps of searching for pdfs and analysing
        them together
    '''
    urls = search_pdf(search, sargs)
    for url in urls:
        grab_url(url, outdir)

def search_pdf(search, sargs):
    ''' the function where googlesearch from mario vilas
        is called
    '''

    query = '%s filetype:pdf' % search
    #print(query)
    urls = []
    # sargs is the maximum number of results to fetch (see --search-number)
    for url in gs.search(query, stop=sargs):
        print(url)
        urls.append(url)

    return urls

def run(args):

    # specify output directory
    outdir = args.outdir

    # create output directory
    make_directory(outdir)

    # lets see what the object is
    if args.url_single:
        url = args.url_single
        print('[+] Grabbing %s' % (url))
        grab_url(url, outdir)

    elif args.file_single:
        pdffile = args.file_single
        print('[+] Parsing %s' % (pdffile))
        _parse_pdf(pdffile)

    elif args.search:
        search = args.search
        print('[+] Seek and analysing %s' % (search))
        sargs = args.search_number
        seek_and_analyse(search, sargs, outdir)

    elif args.files_dir:
        directory = args.files_dir
        print('[+] Analyse pdfs in directory %s' % (directory))
        files = os.listdir(directory)
        for f in files:
            fpath = '%s/%s' % (directory, f)
            _parse_pdf(fpath)

    else:
        print('[-] Dunno what to do, bro.')
    #logfile = "%s/%s.txt" % (out,out)
    #flog = open(logfile,"w")

def main():
    parser_desc = "%s %s %s" % (_name_, _version_, _author_)
    parser = argparse.ArgumentParser(prog=_name_, description=parser_desc)
    parser.add_argument('-o', '--outdir', action='store', dest='outdir', required=False, help="define the output directory for downloaded files and analysis output", default='pdfgrab')
    parser.add_argument('-u', '--url', action='store', dest='url_single', required=False, help="grab pdf from specified url for analysis", default=None)
    parser.add_argument('-f', '--file', action='store', dest='file_single', required=False, help="specify local path of pdf for analysis", default=None)
    parser.add_argument('-F', '--files-dir', action='store', dest='files_dir', required=False, help="specify local path of *directory* with pdf *files* for analysis", default=None)
    parser.add_argument('-s', '--search', action='store', dest='search', required=False, help="specify domain or tld to scrape for pdf-files", default=None)
    parser.add_argument('-sn', '--search-number', action='store', dest='search_number', required=False, help="specify how many files are searched", default=10, type=int)

    args = parser.parse_args()
    run(args)


if __name__ == "__main__":
    main()