Release of version 0.4.7: added HTML reporting, added logging, reordered libraries, added experimental XMP metadata, fixed a bug introduced by the XMP metadata support, added a TODO list
This commit is contained in:
0
libs/__init__.py
Normal file
0
libs/__init__.py
Normal file
30
libs/libgoogle.py
Normal file
30
libs/libgoogle.py
Normal file
@@ -0,0 +1,30 @@
|
||||
import googlesearch as gs
|
||||
import urllib
|
||||
from libs.libhelper import *
|
||||
|
||||
def get_random_agent():
    """Return a user-agent string picked by ``googlesearch``."""
    agent = gs.get_random_user_agent()
    return agent
|
||||
|
||||
def search_pdf(search, args):
    """Run a ``filetype:pdf`` Google query and collect the result URLs.

    The search itself is delegated to ``googlesearch.search`` (imported
    as ``gs``).

    Args:
        search: Search term; ``' filetype:pdf'`` is appended to it.
        args: Object providing a ``search_stop`` attribute, which is
            forwarded as the ``stop`` argument of ``gs.search``.

    Returns:
        A list with every URL yielded by ``gs.search``, or ``-1`` when
        the search raised ``urllib.error.HTTPError``.
    """
    # ``import urllib`` alone does not guarantee that the ``urllib.error``
    # submodule is loaded, so import the exception class explicitly.
    from urllib.error import HTTPError

    search_stop = args.search_stop
    query = '%s filetype:pdf' % search

    try:
        urls = list(
            gs.search(
                query,
                num=20,
                stop=search_stop,
                user_agent=gs.get_random_user_agent(),
            )
        )
    except HTTPError as e:
        print('Error: %s' % e)
        # Kept as -1 (rather than raising) for backward compatibility
        # with existing callers.
        return -1

    return urls
|
||||
|
||||
37
libs/libhelper.py
Normal file
37
libs/libhelper.py
Normal file
@@ -0,0 +1,37 @@
|
||||
import os
|
||||
import sys
|
||||
from Crypto.Hash import SHA256
|
||||
|
||||
def make_directory(outdir):
    """Create the directory *outdir*, ignoring filesystem errors.

    This is a best-effort ``mkdir``: any ``OSError`` (for example the
    directory already exists, or the parent is missing) is silently
    ignored. Non-filesystem errors, such as passing a non-path value,
    are no longer swallowed.

    Args:
        outdir: Path of the directory to create.

    Returns:
        None.
    """
    try:
        os.mkdir(outdir)
    except OSError:
        # Deliberately best-effort: the directory most likely exists.
        pass
|
||||
|
||||
def url_strip(url):
    """Strip trailing newlines, then trailing carriage returns, from *url*.

    The two strips run in that order, so ``"x\\r\\n"`` becomes ``"x"``
    while ``"x\\n\\r"`` becomes ``"x\\n"``.
    """
    return url.rstrip("\n").rstrip("\r")
|
||||
|
||||
def create_sha256(hdata):
    """Return the hexadecimal SHA-256 digest of *hdata*.

    Introduced to hash file names so that each one gets a unique id.

    Args:
        hdata: Text to hash; it is encoded with ``str.encode()`` (UTF-8)
            before hashing. Bytes-like input (``bytes``, ``bytearray``,
            ``memoryview``) is also accepted and hashed as-is.

    Returns:
        The digest as a 64-character lowercase hex string.
    """
    # The standard library yields the same digest, so this function does
    # not need the third-party hash implementation.
    import hashlib

    if isinstance(hdata, (bytes, bytearray, memoryview)):
        payload = bytes(hdata)
    else:
        payload = hdata.encode()
    return hashlib.sha256(payload).hexdigest()
|
||||
|
||||
def find_name(pdf):
    """Return the part of *pdf* after its last ``/``.

    This is a plain string split: nothing is URL-decoded, and a value
    ending in ``/`` yields an empty string. A value without any ``/`` is
    returned unchanged.
    """
    # Only the final component matters, so split once from the right.
    return pdf.rsplit("/", 1)[-1]
|
||||
|
||||
17
libs/liblog.py
Normal file
17
libs/liblog.py
Normal file
@@ -0,0 +1,17 @@
|
||||
import logging

# Module-wide logger: everything from DEBUG upwards is accepted here and
# then filtered per handler.
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)

# One line format shared by both handlers.
formatter = logging.Formatter('%(asctime)s:%(name)s:%(levelname)s:%(message)s')

# The file handler has no level of its own, so it receives every record
# the logger lets through.
file_handler = logging.FileHandler('pdfgrab.log')

# The console only shows warnings and above.
console_handler = logging.StreamHandler()
console_handler.setLevel(logging.WARNING)

# Attach in the original order: file first, then console.
for _handler in (file_handler, console_handler):
    _handler.setFormatter(formatter)
    logger.addHandler(_handler)
|
||||
Reference in New Issue
Block a user