"""Small Selenium/Firefox helper (``Bot``) plus a generic pagination collector."""

import os
from tempfile import mkdtemp
from time import sleep

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.firefox.service import Service as FirefoxService
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from webdriver_manager.firefox import GeckoDriverManager
#from selenium.webdriver.chrome.options import Options

# Scratch notes: Selenium snippets kept for reference (not executed).
#element = driver.find_element_by_xpath("//div[@class='blockUI blockOverlay']")
#wait.until(EC.invisibility_of_element_located((By.XPATH, "//div[@class='blockUI blockOverlay']")))
#ele = WebDriverWait(browser, 10).until(
#    EC.presence_of_element_located((By.ID, "myDynamicElement"))
#)
#print( browser.title )
#ele = browser.find_element_by_css_selector(".myclass")
#ele.get_attribute("href")
#ele.send_keys("test")
#ele.send_keys(Keys.RETURN)
#ele.click()


def innerHTML(element):
    """
    Returns the inner HTML of an element as a UTF-8 encoded bytestring.

    Simply delegates to ``element.encode_contents()``, so *element* must
    provide that method.
    """
    return element.encode_contents()


def get_elements_by_tag_name(ele, tag_name):
    """Return the elements below *ele* that match *tag_name* (``By.TAG_NAME``)."""
    return ele.find_elements(By.TAG_NAME, tag_name)


def get_children(ele):
    """Return the direct child elements of *ele* (XPath ``./child::*``)."""
    return ele.find_elements(By.XPATH, "./child::*")


class Bot:
    """
    Context manager wrapping one Firefox WebDriver session.

    The browser is started in ``__enter__`` and shut down in ``__exit__``::

        with Bot() as bot:
            bot.set_url("https://example.org")
            soup = bot.get_page_soup()

    Args:
        display: When false (the default) Firefox is started headless.
        implicit_wait: Value handed to ``WebDriver.implicitly_wait``.
            NOTE(review): Selenium interprets this value as *seconds*, so the
            historical default of 5000 is roughly 83 minutes; it is kept
            unchanged for backward compatibility -- pass a smaller value
            (e.g. ``implicit_wait=5``) if that was the intent.
    """

    def __init__(self, display=False, implicit_wait=5000):
        self.__display = display
        self.__implicit_wait = implicit_wait
        self.__current_url = None
        # Created in __enter__; stays None while no session is running.
        self.__browser = None

    def __enter__(self):
        options = webdriver.FirefoxOptions()
        if not self.__display:
            # Request headless mode for this session only, instead of setting
            # the process-wide MOZ_HEADLESS variable (which was never reset
            # and therefore leaked into every later Bot in the same process).
            options.add_argument("-headless")

        #firefox_executable_path = '/usr/local/bin/geckodriver'
        service = FirefoxService(executable_path=GeckoDriverManager().install())
        browser = webdriver.Firefox(service=service, options=options)
        try:
            browser.implicitly_wait(self.__implicit_wait)
        except Exception:
            # __exit__ is not invoked when __enter__ raises, so clean up here.
            browser.quit()
            raise

        self.__browser = browser
        return self

    def __exit__(self, *args):
        # quit() ends the whole session and stops the geckodriver service;
        # close() would only close the current window and leave the driver
        # process behind.
        if self.__browser is not None:
            self.__browser.quit()
            self.__browser = None
        self.__current_url = None

    def __require_url(self):
        """Raise if no page has been loaded through ``set_url`` yet."""
        if self.__current_url is None:
            raise RuntimeError("No URL set! \nNo DOM to affect!")

    def click(self, ele):
        """
        Click *ele* by running ``arguments[0].click()`` in the page.

        Raises:
            RuntimeError: if ``set_url`` has not been called yet.
        """
        self.__require_url()
        self.__browser.execute_script("arguments[0].click()", ele)

    def click_id(self, id):
        """
        Click the element whose ``id`` attribute equals *id*.

        Raises:
            RuntimeError: if ``set_url`` has not been called yet.
        """
        # Check first: looking up an element on a blank page would otherwise
        # block for the full implicit wait before failing.
        self.__require_url()
        self.click(self.__browser.find_element(By.ID, id))

    def set_url(self, url):
        """Navigate the browser to *url* and remember it as the current URL."""
        self.__browser.get(url)
        self.__current_url = url

    #DOM methods

    def get_elements_by_class_name(self, cls_name):
        """Return all elements of the page matching ``By.CLASS_NAME`` *cls_name*."""
        return self.__browser.find_elements(By.CLASS_NAME, cls_name)

    def get_elements_by_tag_name(self, tag_name):
        """Return all elements of the page matching ``By.TAG_NAME`` *tag_name*."""
        return self.__browser.find_elements(By.TAG_NAME, tag_name)

    def get_elements_by_xpath(self, path):
        """Return all elements of the page matching the XPath expression *path*."""
        return self.__browser.find_elements(By.XPATH, path)

    def get_page_content(self):
        """Return ``document.documentElement.innerHTML`` of the current page."""
        #WebDriverWait(self.driver, 10).until(EC.presence_of_element_located((By.TAG_NAME, 'body')))
        return self.__browser.execute_script("return document.documentElement.innerHTML")

    def get_page_soup(self):
        """Return the current page content parsed with ``html.parser``."""
        return BeautifulSoup(self.get_page_content(), "html.parser")

    def sleep(self, t):
        """Block for *t* seconds (thin wrapper around ``time.sleep``)."""
        sleep(t)

    def scroll(self, d=250):
        """Scroll the window vertically by *d* via ``window.scrollBy(0, d)``."""
        # *d* is interpolated into the script text as-is.
        self.__browser.execute_script(f"window.scrollBy(0,{d})")


def collect_pagination_items(bot, start_url, next_page, get_nr_pages, get_items,
                             kill_cookie_questions=lambda: None):
    """
    Collect all the content of a pagination.

    Args:
        bot: Object providing ``set_url(url)`` and ``sleep(seconds)``.
        start_url: URL of the first page.
        next_page: Zero-argument callable that advances to the next page.
        get_nr_pages: Zero-argument callable returning the number of pages.
        get_items: Zero-argument callable returning an iterable with the
            items of the page currently shown.
        kill_cookie_questions: Zero-argument callable run once right after
            the first page is loaded; does nothing by default.

    Returns:
        A list with the items of all pages, in page order.
    """
    bot.set_url(start_url)
    kill_cookie_questions()
    bot.sleep(4)
    nr_pages = get_nr_pages()
    bot.sleep(2)

    results = []
    for page_nr in range(nr_pages):
        results.extend(get_items())
        if page_nr == nr_pages - 1:
            # Last page: there is nothing left to advance to, so do not
            # trigger next_page() (and its wait) once more.
            break
        bot.sleep(0.5)
        next_page()
        bot.sleep(2)
    return results


# Legacy scratch snippets, kept inert inside a string literal.
"""
#ele.text
#ele = browser.find_element_by_id("")
ele = browser.find_element_by_name("s")
#ele = browser.find_element_by_css_selector(".myclass")
#ele.get_attribute("href")
print(ele)
ele.send_keys("test")
ele.send_keys(Keys.RETURN)

def getKeiserHeadlines():
    url = "http://maxkeiser.com/"
    soup = BeautifulSoup(readUrl(url), "html.parser")
    for h1 in soup.findAll("h1"):
        if "post-title" in h1["class"]:
            if h1.a.string != None:
                print(h1.a.string)

    soup = BeautifulSoup(readUrl(url), "html.parser")
    for ele in soup.findAll("title"):
        return ele.string

    soup = BeautifulSoup(getSiteContent(url))
    ls = []
    for i in soup.findAll(tagName):
        if i.get("class") == className:
            if link:
                if i.a.string != None:
                    ls.append(i.a.string)
            else:
                ls.append(i.string)
    return ls

def getWeltHeadLines():
    soup = BeautifulSoup(getSiteContent("http://welt.de"))
    ls = []
    for i in soup.findAll("h4"):
        if i.get("class") == "headline":
            if i.a.string != None:
                ls.append(i.a.string)
    return ls
"""