All checks were successful
Gitea Docker Redeploy / Redploy-App-on-self-via-SSH (push) Successful in 19s
177 lines
5.0 KiB
Python
177 lines
5.0 KiB
Python
import os
|
|
|
|
from selenium import webdriver
|
|
from selenium.webdriver.common.keys import Keys
|
|
from selenium.webdriver.common.by import By
|
|
from selenium.webdriver.support.ui import WebDriverWait
|
|
from selenium.webdriver.support import expected_conditions as EC
|
|
|
|
#from selenium.webdriver.chrome.options import Options
|
|
|
|
from webdriver_manager.firefox import GeckoDriverManager
|
|
from selenium.webdriver.firefox.service import Service as FirefoxService
|
|
|
|
|
|
from tempfile import mkdtemp
|
|
from time import sleep
|
|
from bs4 import BeautifulSoup
|
|
|
|
#element = driver.find_element_by_xpath("//div[@class='blockUI blockOverlay']")
|
|
#wait.until(EC.invisibility_of_element_located((By.XPATH, "//div[@class='blockUI blockOverlay']")))
|
|
#ele = WebDriverWait(browser, 10).until(
|
|
# EC.presence_of_element_located((By.ID, "myDynamicElement"))
|
|
#)
|
|
#print( browser.title )
|
|
#ele = browser.find_element_by_css_selector(".myclass")
|
|
#ele.get_attribute("href")
|
|
#ele.send_keys("test")
|
|
#ele.send_keys(Keys.RETURN)
|
|
#ele.click()
|
|
|
|
def innerHTML(element):
    """Serialize everything nested inside ``element``.

    The work is delegated to ``element.encode_contents()``; its result
    (the inner HTML as a UTF-8 encoded bytestring) is returned unchanged.
    """
    inner_markup = element.encode_contents()
    return inner_markup
|
|
|
|
def get_elements_by_tag_name(ele, tag_name):
    """Look up elements by tag name, searching from ``ele``.

    Forwards to ``ele.find_elements`` with the ``By.TAG_NAME`` strategy
    and returns whatever that call yields.
    """
    matches = ele.find_elements(By.TAG_NAME, tag_name)
    return matches
|
|
|
|
def get_children(ele):
    """Return the direct child elements of ``ele``.

    Uses the XPath ``./child::*`` relative to ``ele``, which selects
    every element node exactly one level below it.
    """
    direct_children_xpath = "./child::*"
    return ele.find_elements(By.XPATH, direct_children_xpath)
|
|
|
|
|
|
class Bot:
    """Context-managed wrapper around a Selenium Firefox session.

    The browser is started in ``__enter__`` and shut down in ``__exit__``::

        with Bot() as bot:
            bot.set_url("https://example.org")
            soup = bot.get_page_soup()
    """

    def __init__(self, display=False, implicit_wait=5):
        """Store the configuration; no browser is launched here.

        Args:
            display: When false (the default) Firefox is started headless.
            implicit_wait: Seconds Selenium keeps polling the DOM before an
                element lookup gives up. Selenium's ``implicitly_wait``
                takes seconds, so keep this small: every lookup for an
                absent element blocks for the full duration.
        """
        self.__display = display
        self.__implicit_wait = implicit_wait
        self.__current_url = None
        self.__browser = None

    def __enter__(self):
        """Launch Firefox through geckodriver and return this bot."""
        options = webdriver.FirefoxOptions()
        if not self.__display:
            # A per-session flag: unlike setting MOZ_HEADLESS in os.environ,
            # it does not leak into later Bot(display=True) instances.
            options.add_argument("-headless")

        service = FirefoxService(executable_path=GeckoDriverManager().install())

        self.__browser = webdriver.Firefox(service=service, options=options)
        self.__browser.implicitly_wait(self.__implicit_wait)
        return self

    def __exit__(self, *args):
        """Shut the browser session down; exceptions are not suppressed."""
        browser, self.__browser = self.__browser, None
        self.__current_url = None
        if browser is not None:
            # quit() ends the whole session and stops the driver process;
            # close() would only close the current window.
            browser.quit()

    def click(self, ele):
        """Click ``ele`` by running ``click()`` on it via JavaScript.

        Raises:
            Exception: If no page has been loaded with ``set_url`` yet.
        """
        if self.__current_url is None:
            raise Exception("No URL set! No DOM to affect!")
        self.__browser.execute_script("arguments[0].click()", ele)

    def click_id(self, id):
        """Find the element with the given DOM id and click it."""
        self.click(self.__browser.find_element(By.ID, id))

    def set_url(self, url):
        """Navigate the browser to ``url`` and remember it as current."""
        self.__browser.get(url)
        self.__current_url = url

    # DOM methods
    def get_elements_by_class_name(self, cls_name):
        """Return the page elements matching the class name ``cls_name``."""
        return self.__browser.find_elements(By.CLASS_NAME, cls_name)

    def get_elements_by_tag_name(self, tag_name):
        """Return the page elements matching the tag name ``tag_name``."""
        return self.__browser.find_elements(By.TAG_NAME, tag_name)

    def get_elements_by_xpath(self, path):
        """Return the page elements matching the XPath expression ``path``."""
        return self.__browser.find_elements(By.XPATH, path)

    def get_page_content(self):
        """Return ``document.documentElement.innerHTML`` of the live page."""
        return self.__browser.execute_script("return document.documentElement.innerHTML")

    def get_page_soup(self):
        """Return the current page content parsed with ``html.parser``."""
        return BeautifulSoup(self.get_page_content(), "html.parser")

    def sleep(self, t):
        """Block for ``t`` seconds."""
        sleep(t)

    def scroll(self, d=250):
        """Scroll the window vertically by ``d`` pixels (default 250)."""
        self.__browser.execute_script(f"window.scrollBy(0,{d})")
|
|
|
|
|
|
def collect_pagination_items(bot, start_url, next_page, get_nr_pages, get_items, kill_cookie_questions=lambda: None):
    """Collect the items of every page of a paginated listing.

    Args:
        bot: Object providing ``set_url(url)`` and ``sleep(seconds)``.
        start_url: URL of the first page of the pagination.
        next_page: Zero-argument callable that advances to the next page.
        get_nr_pages: Zero-argument callable returning the number of pages.
        get_items: Zero-argument callable returning an iterable with the
            items of the page currently shown.
        kill_cookie_questions: Optional zero-argument callable invoked once
            right after the start page is opened; does nothing by default.

    Returns:
        A list with the items of all pages, in page order.
    """
    bot.set_url(start_url)
    kill_cookie_questions()
    # Fixed pauses between steps; presumably they give the page time to
    # settle before it is queried again.
    bot.sleep(4)
    nr_pages = get_nr_pages()
    bot.sleep(2)

    results = []
    last_page_nr = nr_pages - 1
    for page_nr in range(nr_pages):
        results.extend(get_items())
        if page_nr == last_page_nr:
            # Nothing left to collect: do not navigate past the last page.
            break
        bot.sleep(0.5)
        next_page()
        bot.sleep(2)

    return results
|
|
|
|
"""
|
|
#ele.text
|
|
#ele = browser.find_element_by_id("")
|
|
ele = browser.find_element_by_name("s")
|
|
#ele = browser.find_element_by_css_selector(".myclass")
|
|
#ele.get_attribute("href")
|
|
|
|
print(ele)
|
|
|
|
ele.send_keys("test")
|
|
ele.send_keys(Keys.RETURN)
|
|
|
|
|
|
def getKeiserHeadlines():
|
|
url = "http://maxkeiser.com/"
|
|
soup = BeautifulSoup(readUrl(url), "html.parser")
|
|
for h1 in soup.findAll("h1"):
|
|
if "post-title" in h1["class"]:
|
|
if h1.a.string != None:
|
|
print(h1.a.string)
|
|
|
|
soup = BeautifulSoup(readUrl(url), "html.parser")
|
|
for ele in soup.findAll("title"):
|
|
return ele.string
|
|
|
|
soup = BeautifulSoup(getSiteContent(url))
|
|
ls = []
|
|
for i in soup.findAll(tagName):
|
|
if i.get("class") == className:
|
|
if link:
|
|
if i.a.string != None:
|
|
ls.append(i.a.string)
|
|
else:
|
|
ls.append(i.string)
|
|
return ls
|
|
|
|
|
|
def getWeltHeadLines():
|
|
soup = BeautifulSoup(getSiteContent("http://welt.de"))
|
|
ls = []
|
|
for i in soup.findAll("h4"):
|
|
if i.get("class") == "headline":
|
|
if i.a.string != None:
|
|
ls.append(i.a.string)
|
|
return ls
|
|
|
|
"""
|