Commit fd92464

Add files via upload
1 parent 89b4f7c commit fd92464

3 files changed: +155 -2 lines changed

README.md

+102 -2

# Web Scraping with Python Selenium
[<img src="https://img.shields.io/static/v1?label=&message=python&color=brightgreen" />](https://github.com/topics/python) [<img src="https://img.shields.io/static/v1?label=&message=selenium&color=blue" />](https://github.com/topics/selenium) [<img src="https://img.shields.io/static/v1?label=&message=Web%20Scraping&color=important" />](https://github.com/topics/web-scraping)

- [Installing Selenium](#installing-selenium)
- [Testing](#testing)
- [Scraping with Selenium](#scraping-with-selenium)

In this article, we’ll cover an overview of web scraping with Selenium using a real-life example.

For a detailed tutorial on Selenium, see [our blog](https://oxylabs.io/blog/selenium-web-scraping).
## Installing Selenium

1. Create a virtual environment and activate it:

```sh
python3 -m venv .env
source .env/bin/activate
```

2. Install Selenium using pip:

```sh
pip install selenium
```

3. Install the Selenium Web Driver for your browser. See [this page](https://www.selenium.dev/documentation/webdriver/getting_started/install_drivers/) for details.
## Testing

With the virtual environment activated, start the Python interactive shell by typing `python3`. Enter the following command at the prompt:

```python
>>> from selenium.webdriver import Chrome
```

If there are no errors, move on to the next step. If there is an error, ensure that `chromedriver` is added to the PATH.
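As a quick sanity check before launching a browser, a stdlib-only helper can confirm the driver binary is discoverable on the PATH. This is a sketch; the binary name `chromedriver` is the common default but may differ per platform (e.g. `chromedriver.exe` on Windows):

```python
import shutil


def driver_on_path(binary: str = "chromedriver") -> bool:
    """Return True if the given driver binary can be found on the PATH."""
    return shutil.which(binary) is not None


if __name__ == "__main__":
    print(f"chromedriver on PATH: {driver_on_path()}")
```

If this prints `False`, move the driver into a directory listed in your PATH (or extend the PATH) before proceeding.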
## Scraping with Selenium

Import required modules as follows:

```python
from selenium.webdriver import Chrome, ChromeOptions
from selenium.webdriver.common.by import By
```
Add the skeleton of the script as follows:

```python
def get_data(url) -> list:
    ...


def main():
    ...


if __name__ == '__main__':
    main()
```
Create a `ChromeOptions` object and set `headless` to `True`. Use it to create an instance of `Chrome`:

```python
browser_options = ChromeOptions()
browser_options.headless = True

driver = Chrome(options=browser_options)
```
Call the `driver.get` method to load a URL. After that, locate the link for the Humor section by link text and click it:

```python
driver.get(url)

element = driver.find_element(By.LINK_TEXT, "Humor")
element.click()
```
Create a CSS selector to find all books on this page. After that, run a loop over the books and extract the book title, price, and stock availability. Use a dictionary to store one book's information and add all these dictionaries to a list. See the code below:

```python
books = driver.find_elements(By.CSS_SELECTOR, ".product_pod")
data = []
for book in books:
    title = book.find_element(By.CSS_SELECTOR, "h3 > a")
    price = book.find_element(By.CSS_SELECTOR, ".price_color")
    stock = book.find_element(By.CSS_SELECTOR, ".instock.availability")
    book_item = {
        'title': title.get_attribute("title"),
        'price': price.text,
        'stock': stock.text
    }
    data.append(book_item)
```
Lastly, call `driver.quit()` and return the `data` list from this function.

For the complete code, see [main.py](src/main.py).

For a detailed tutorial on Selenium, see [our blog](https://oxylabs.io/blog/selenium-web-scraping).
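The list of dictionaries that `get_data` returns is easy to persist. A minimal sketch using the stdlib `csv` module, with illustrative sample records in the same shape the scraper produces (the records and the file name `books.csv` are assumptions for the example):

```python
import csv

# Illustrative records in the shape get_data() returns.
data = [
    {'title': 'Sample Book', 'price': '£12.99', 'stock': 'In stock'},
    {'title': 'Another Book', 'price': '£7.50', 'stock': 'In stock'},
]

# Write one CSV row per book, with a header row first.
with open("books.csv", "w", newline="", encoding="utf-8") as f:
    writer = csv.DictWriter(f, fieldnames=['title', 'price', 'stock'])
    writer.writeheader()
    writer.writerows(data)
```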

src/main.py

+38

from selenium.webdriver import Chrome, ChromeOptions
from selenium.webdriver.common.by import By


def get_data(url) -> list:
    browser_options = ChromeOptions()
    browser_options.headless = True

    driver = Chrome(options=browser_options)
    driver.get(url)

    element = driver.find_element(By.LINK_TEXT, "Humor")
    element.click()

    books = driver.find_elements(By.CSS_SELECTOR, ".product_pod")
    data = []
    for book in books:
        title = book.find_element(By.CSS_SELECTOR, "h3 > a")
        price = book.find_element(By.CSS_SELECTOR, ".price_color")
        stock = book.find_element(By.CSS_SELECTOR, ".instock.availability")
        book_item = {
            'title': title.get_attribute("title"),
            'price': price.text,
            'stock': stock.text
        }
        data.append(book_item)

    driver.quit()
    return data


def main():
    data = get_data("https://books.toscrape.com/")
    print(data)


if __name__ == '__main__':
    main()

src/requirements.txt

+15

async-generator==1.10
attrs==22.1.0
certifi==2022.9.24
exceptiongroup==1.0.0
h11==0.14.0
idna==3.4
outcome==1.2.0
PySocks==1.7.1
selenium==4.5.0
sniffio==1.3.0
sortedcontainers==2.4.0
trio==0.22.0
trio-websocket==0.9.2
urllib3==1.26.12
wsproto==1.2.0
