Release of version 0.4.7: added HTML reporting, added logging, reordered libraries, added experimental XMP metadata, fixed a bug introduced by the XMP metadata support, added a todo list

This commit is contained in:
c0decave
2019-11-05 14:42:24 +01:00
parent fa3b925d6f
commit e1d7c3f760
7 changed files with 476 additions and 388 deletions

0
libs/__init__.py Normal file
View File

30
libs/libgoogle.py Normal file
View File

@@ -0,0 +1,30 @@
import googlesearch as gs
import urllib
from libs.libhelper import *
def get_random_agent():
    ''' hand back whatever user agent string googlesearch picks at random '''
    agent = gs.get_random_user_agent()
    return agent
def search_pdf(search, args):
    ''' run a google search restricted to pdf files

    Thin wrapper around googlesearch.search (the library by Mario Vilas).
    The term is extended with "filetype:pdf" before it is sent.

    search -- the search term(s)
    args   -- object exposing ``search_stop``; that value is handed to
              googlesearch as its ``stop`` argument

    Returns the list of result urls, or -1 when googlesearch raises an
    HTTPError (the error is printed; urls collected so far are dropped).
    '''
    # A bare ``import urllib`` does not guarantee that the ``urllib.error``
    # submodule is loaded, so import it explicitly before naming it below.
    import urllib.error

    query = '%s filetype:pdf' % search
    try:
        # gs.search yields urls lazily; list() drains it inside the try so
        # network errors raised mid-iteration are still caught here.
        return list(gs.search(query,
                              num=20,
                              stop=args.search_stop,
                              user_agent=gs.get_random_user_agent()))
    except urllib.error.HTTPError as e:
        print('Error: %s' % e)
        # -1 is kept as the error marker for existing callers.
        return -1

37
libs/libhelper.py Normal file
View File

@@ -0,0 +1,37 @@
import os
import sys
from Crypto.Hash import SHA256
def make_directory(outdir):
    ''' best-effort mkdir

    Creates the single directory ``outdir`` (parents are not created).
    If the path already exists nothing happens. Any other OS-level failure
    is reported as a warning on stderr instead of being raised, so callers
    keep their "never fails" behaviour.

    Returns None.
    '''
    try:
        os.mkdir(outdir)
    except FileExistsError:
        # the expected case on every run after the first one
        pass
    except OSError as err:
        # e.g. missing parent or no permission: stay best-effort, but do not
        # hide the reason from the user anymore
        print("[W] mkdir %s failed: %s" % (outdir, err), file=sys.stderr)
def url_strip(url):
    ''' remove trailing line terminators from a url

    All trailing "\\r" and "\\n" characters are removed in one pass, in
    whatever order or number they appear, so "\\n", "\\r\\n" and repeated
    line endings are all handled. Everything else is left untouched.

    Returns the stripped string.
    '''
    # one rstrip with both characters; stripping "\n" and "\r" one after the
    # other leaves leftovers on input such as "a\r\n\r\n"
    return url.rstrip("\r\n")
def create_sha256(hdata):
    ''' return the sha256 hex digest of hdata

    Introduced to hash file names so they can serve as a unique id.

    hdata -- str (encoded with the default utf-8 before hashing) or
             bytes (hashed as is)

    Returns the digest as a lowercase hex string.
    '''
    # the standard library produces the same digest, so this helper does not
    # need the third-party Crypto package
    import hashlib

    if isinstance(hdata, str):
        hdata = hdata.encode()
    return hashlib.sha256(hdata).hexdigest()
def find_name(pdf):
    ''' return the storage name for a pdf url

    The name is everything after the last "/" of the given string. No url
    decoding is done and a query string is not removed, so this stays a
    naive approach. A string without "/" is returned unchanged, a string
    ending in "/" gives an empty name.
    '''
    # rpartition yields (head, sep, tail); tail is the last path segment
    _head, _sep, filename = pdf.rpartition("/")
    return filename

17
libs/liblog.py Normal file
View File

@@ -0,0 +1,17 @@
import logging
# One formatter shared by both handlers: timestamp, logger name, level, message.
formatter = logging.Formatter('%(asctime)s:%(name)s:%(levelname)s:%(message)s')

# File handler: no level of its own, so it gets every record the logger lets through.
file_handler = logging.FileHandler('pdfgrab.log')
file_handler.setFormatter(formatter)

# Console handler: only WARNING and above are shown.
console_handler = logging.StreamHandler()
console_handler.setLevel(logging.WARNING)
console_handler.setFormatter(formatter)

# Module logger: accepts records from DEBUG upwards and passes them to both handlers.
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
logger.addHandler(file_handler)
logger.addHandler(console_handler)