
Commit a615b97

Rahul Gupta committed: Initial commit
1 parent 04e463a commit a615b97

File tree: 8 files changed, +6532 -1 lines


README.md (+40 -1)
# scrape-employees-data-from-greythr

This repository serves as an illustration of how to retrieve and scrape data from Greythr using Selenium.
## Features

- Automate login on Greythr using an employee username and password.
- Scrape all employees' data and store it in an xlsx file.
- Construct a bar graph that illustrates the employee count categorized by their respective designations.

![Employees count](images/image1.png)

- Generate a pie chart representing the distribution of employees based on their birth months.

![Employees birthday](images/image2.png)
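The birth-month chart hinges on one parsing step: each scraped DOB string is converted to a month number and then a month name (scrap.py below uses the `'%d %b'` format for this). A minimal sketch of that step, using made-up sample DOB values:

```python
import calendar

import pandas as pd

# Hypothetical DOB strings in the shape the scraper stores: day + abbreviated month
dob = pd.Series(['14 Mar', '02 Nov', '29 Mar'])

# Parse with the same '%d %b' format, take the month number, then map it to a name
months = pd.to_datetime(dob, format='%d %b').dt.month.apply(lambda m: calendar.month_name[m])

print(months.value_counts().to_dict())  # {'March': 2, 'November': 1}
```

The resulting `value_counts()` series is exactly what feeds the pie chart.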
## Dependencies

1. **Python**: Make sure Python is installed on your system. You can download the latest version from the official Python website (https://www.python.org) and follow the installation instructions for your operating system.

2. **pip**: Check whether pip is installed by running the following command in your command-line interface or terminal:

```sh
pip --version
```

   If pip is not installed, you can install it by following the instructions provided on the official Python website.

3. **Chrome browser**: Ensure that the Chrome web browser is installed on your system. (The provided code has been tested on Chrome version 114.0.5735.198 (Official Build) (64-bit).)

4. **ChromeDriver**: Make sure the ChromeDriver version matches the Chrome browser version installed on your system. You can download ChromeDriver from the official ChromeDriver website (https://sites.google.com/a/chromium.org/chromedriver/downloads) and follow the installation instructions. (A matching driver is also included in this repository.)

5. **Required libraries**: Once you have fulfilled the above prerequisites, install the necessary libraries using pip:

```sh
pip install -r requirements.txt
```
## Usage

1. Modify the configuration file (config.ini) by replacing **<YOUR_COMPANY_NAME>** with your company name, **<YOUR_USERNAME>** with your username, and **<YOUR_PASSWORD>** with your Greythr password.
2. To run the script, use the following command:

```sh
python3 scrap.py
```
## License

**Free Software, Hell Yeah!**

## Authors

- [Rahul Gupta](https://github.com/rahulelex)

chromedriver_linux64/LICENSE.chromedriver (+6,287): large diff not rendered.

chromedriver_linux64/chromedriver (13.2 MB): binary file not shown.

config.ini (+10)

[DEFAULT]
xlsx_file_name = employee_data.xlsx
login_url = https://<YOUR_COMPANY_NAME>.greythr.com/
employee_url = https://<YOUR_COMPANY_NAME>.greythr.com/v3/portal/ess/people/directory
data_url = https://<YOUR_COMPANY_NAME>.greythr.com/v3/api/employee/list?page=
page_size = 250

[database]
username = <YOUR_USERNAME>
password = <YOUR_PASSWORD>
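scrap.py reads these values with Python's standard `configparser` module. A minimal sketch of how the lookups behave, using a throwaway in-memory config in the same shape as config.ini (the values here are placeholders, not real credentials):

```python
import configparser

# A throwaway config mirroring the structure of config.ini above
text = """
[DEFAULT]
xlsx_file_name = employee_data.xlsx
page_size = 250

[database]
username = demo_user
password = demo_pass
"""

config = configparser.ConfigParser()
config.read_string(text)

# DEFAULT values are inherited by every section, and get() always returns strings
print(config.get('DEFAULT', 'xlsx_file_name'))  # employee_data.xlsx
print(config.get('database', 'username'))       # demo_user
print(config.get('database', 'page_size'))      # 250 (inherited from DEFAULT)
```

Note that `page_size` comes back as the string `'250'`, which is why the script can concatenate it directly into the request URL.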

images/image1.png (44.1 KB)

images/image2.png (147 KB)

requirements.txt (+5)

requests
pandas
selenium
matplotlib
configparser

scrap.py (+190)
import calendar
import configparser
import json
import os
import time

import matplotlib.pyplot as plt
import pandas as pd
import requests
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By


class DataExporter:
    def __init__(self):
        # Create a ConfigParser object and read the configuration file
        self.config = configparser.ConfigParser()
        self.config.read('config.ini')

        # Access the configuration values
        self.xlsx_file_path = self.config.get('DEFAULT', 'xlsx_file_name')
        self.username = self.config.get('database', 'username')
        self.password = self.config.get('database', 'password')
        self.driver = None
        self.access_token = None
        self._ga = None
        self._gid = None
        self.max_number_of_pages = 10
        self.login_url = self.config.get('DEFAULT', 'login_url')
        self.employee_url = self.config.get('DEFAULT', 'employee_url')
        self.data_url = self.config.get('DEFAULT', 'data_url')
        self.page_size = self.config.get('DEFAULT', 'page_size')

    def login(self):
        # Configure Chrome options for headless browsing
        chrome_options = Options()
        chrome_options.add_argument('--headless')  # Run Chrome in headless mode
        chrome_options.add_argument('--no-sandbox')
        chrome_options.add_argument('--disable-dev-shm-usage')
        chromedriver_path = 'chromedriver_linux64/chromedriver'
        service = Service(chromedriver_path)
        driver = webdriver.Chrome(service=service, options=chrome_options)
        print('Login process initiated')
        driver.get(self.login_url)
        time.sleep(5)

        username_input = driver.find_element(By.ID, 'username')
        password_input = driver.find_element(By.ID, 'password')
        username_input.send_keys(self.username)
        password_input.send_keys(self.password)

        button = driver.find_element(By.CSS_SELECTOR, 'button[type="submit"].bg-primary')
        button.click()

        time.sleep(2)

        # Navigate to the employee directory so the session cookies are set
        driver.get(self.employee_url)

        cookies = driver.get_cookies()
        for cookie in cookies:
            name = cookie.get('name')
            if name == 'access_token':
                self.access_token = cookie.get('value')
            elif name == '_ga':
                self._ga = cookie.get('value')
            elif name == '_gid':
                self._gid = cookie.get('value')

        time.sleep(2)

        if self.access_token and self._ga and self._gid:
            print('LOGIN SUCCESS')
        else:
            print('LOGIN FAILED')
            raise SystemExit(1)

    def fetch_data(self):
        payload = json.dumps({
            "cat::search": {}
        })

        headers = {
            'content-type': 'application/json',
            'cookie': ('access_token=' + self.access_token + '; _ga=' + self._ga +
                       '; _gid=' + self._gid +
                       '; _dc_gtm_UA-642192-18=1; _hjIncludedInSessionSample=1'),
        }

        session = requests.session()

        if os.path.exists(self.xlsx_file_path):
            print('File already exists, removing it.')
            os.remove(self.xlsx_file_path)

        # Collect one single-row DataFrame per employee record
        dfs = []

        for index in range(self.max_number_of_pages):
            response = session.request(
                'POST',
                self.data_url + str(index) + '&pageSize=' + str(self.page_size),
                headers=headers, data=payload)
            page = json.loads(response.text)

            if page.get('results'):
                for record in page.get('results'):
                    e_data = {
                        'Name': record.get('name'),
                        'DOB': record.get('dob'),
                        'Designation': record.get('c_designation'),
                        'Employee No': record.get('employeeno'),
                        'Employee ID': record.get('employeeid'),
                        'Email': record.get('email'),
                    }
                    # Append a DataFrame for the current record to the list
                    dfs.append(pd.DataFrame([e_data]))

        session.close()

        # Concatenate all DataFrames and write the result to an Excel file
        df_merged = pd.concat(dfs, ignore_index=True)
        df_merged.to_excel(self.xlsx_file_path, index=False)
        print('Data appended to the Excel file successfully.')

        # Re-read the xlsx file, drop duplicate rows, and write it back
        df = pd.read_excel(self.xlsx_file_path, engine='openpyxl')
        df.drop_duplicates(inplace=True)
        df.to_excel(self.xlsx_file_path, index=False)
        print('Data cleaned - removed duplicate data')

    def show_charts(self):
        df = pd.read_excel(self.xlsx_file_path, engine='openpyxl')

        # Bar graph of the employee count per designation
        value_counts = df['Designation'].value_counts()
        plt.figure(1)
        value_counts.plot(kind='bar')
        plt.title('Designation chart')
        plt.xlabel('Designation')
        plt.ylabel('Count')

        # Pie chart of the month each employee was born in
        df['Month'] = pd.to_datetime(df['DOB'], format='%d %b').dt.month
        df['Month'] = df['Month'].astype('Int64')  # nullable integer dtype

        # Replace numeric month values with month names
        df['Month'] = df['Month'].apply(lambda m: calendar.month_name[m] if pd.notnull(m) else '')

        # Count the occurrences of each month and plot the pie chart
        month_counts = df['Month'].value_counts()
        plt.figure(2)
        plt.pie(month_counts, labels=month_counts.index, autopct='%1.1f%%')
        plt.title('Month-wise birthday distribution')

        # Show both charts
        plt.show()


def main():
    exporter = DataExporter()
    exporter.login()
    exporter.fetch_data()
    exporter.show_charts()


if __name__ == '__main__':
    main()
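The script deduplicates by writing every scraped record and then dropping duplicate rows, which handles records that appear on more than one page. The core of that pattern, with hypothetical employee rows built the same way fetch_data() builds them:

```python
import pandas as pd

# Hypothetical records, one single-row DataFrame each; 'A' appears twice,
# as it would if two pages of results overlapped
dfs = [
    pd.DataFrame([{'Name': 'A', 'Designation': 'Engineer'}]),
    pd.DataFrame([{'Name': 'B', 'Designation': 'Manager'}]),
    pd.DataFrame([{'Name': 'A', 'Designation': 'Engineer'}]),
]

# Merge all rows, then drop exact-duplicate rows (first occurrence is kept)
merged = pd.concat(dfs, ignore_index=True).drop_duplicates()

print(merged['Name'].tolist())  # ['A', 'B']
```

Fetching more pages than exist is therefore harmless: duplicate or empty pages contribute nothing to the final sheet.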
