|
| 1 | +import requests |
| 2 | +import json |
| 3 | +import time |
| 4 | +from selenium import webdriver |
| 5 | +from selenium.webdriver.chrome.options import Options |
| 6 | +from selenium.webdriver.chrome.service import Service |
| 7 | +from selenium.webdriver.common.by import By |
| 8 | +import configparser |
| 9 | +import os |
| 10 | +import calendar |
| 11 | +import pandas as pd |
| 12 | +import matplotlib.pyplot as plt |
| 13 | +import configparser |
| 14 | + |
| 15 | + |
class DataExporter:
    """Log into a site, export employee records to Excel and chart them.

    Settings are read from ``config.ini``. Typical use is ``login()``, then
    ``fetch_data()``, then ``show_charts()``.
    """

    # Spreadsheet column -> key in each record of the API's ``results`` list.
    _FIELD_MAP = {
        'Name': 'name',
        'DOB': 'dob',
        'Designation': 'c_designation',
        'Employee No': 'employeeno',
        'Employee ID': 'employeeid',
        'Email': 'email',
    }

    # Cookies harvested after login; each is stored on the attribute of the
    # same name.
    _SESSION_COOKIES = ('access_token', '_ga', '_gid')

    def __init__(self):
        """Load settings from ``config.ini`` in the current working directory.

        Raises:
            configparser.Error: If a required section or option is missing.
                ``ConfigParser.read`` silently skips a missing file, so an
                absent ``config.ini`` also surfaces here, on the first ``get``.
        """
        self.config = configparser.ConfigParser()
        self.config.read('config.ini')

        self.xlsx_file_path = self.config.get('DEFAULT', 'xlsx_file_name')
        self.username = self.config.get('database', 'username')
        self.password = self.config.get('database', 'password')
        self.driver = None
        self.access_token = None
        self._ga = None
        self._gid = None
        self.max_number_of_pages = 10
        self.login_url = self.config.get('DEFAULT', 'login_url')
        self.employee_url = self.config.get('DEFAULT', 'employee_url')
        self.data_url = self.config.get('DEFAULT', 'data_url')
        # Kept as the raw config string; it is only ever spliced into a URL.
        self.page_size = self.config.get('DEFAULT', 'page_size')

    def login(self):
        """Sign in through headless Chrome and store the session cookies.

        Fills the login form at ``login_url``, opens ``employee_url`` and
        copies the ``access_token``, ``_ga`` and ``_gid`` cookies onto the
        instance.

        Raises:
            SystemExit: If any of the three cookies is missing after login.
        """
        chrome_options = Options()
        chrome_options.add_argument('--headless')  # Run Chrome in headless mode
        chrome_options.add_argument('--no-sandbox')
        chrome_options.add_argument('--disable-dev-shm-usage')
        service = Service('chromedriver_linux64/chromedriver')

        # Forget cookies from any earlier attempt so a failed re-login cannot
        # be mistaken for a success.
        for cookie_name in self._SESSION_COOKIES:
            setattr(self, cookie_name, None)

        driver = webdriver.Chrome(service=service, options=chrome_options)
        try:
            print('Login process initiated')
            driver.get(self.login_url)
            time.sleep(5)  # fixed wait before looking up the form fields

            driver.find_element(By.ID, 'username').send_keys(self.username)
            driver.find_element(By.ID, 'password').send_keys(self.password)
            driver.find_element(
                By.CSS_SELECTOR, 'button[type="submit"].bg-primary'
            ).click()
            time.sleep(2)  # fixed wait after submitting the form

            driver.get(self.employee_url)
            cookies = driver.get_cookies()
        finally:
            # The browser is not needed once the cookies are read; always shut
            # it down so no Chrome process is left behind.
            driver.quit()

        for cookie in cookies:
            cookie_name = cookie.get('name')
            if cookie_name in self._SESSION_COOKIES:
                setattr(self, cookie_name, cookie.get('value'))

        if all(getattr(self, name) for name in self._SESSION_COOKIES):
            print("LOGIN SUCCESS")
            return

        print("LOGIN FAILED")
        # The original bare ``exit`` expression was a no-op; actually stop.
        raise SystemExit(1)

    def fetch_data(self):
        """Download the employee pages and write them, de-duplicated, to Excel.

        Requests pages ``0 .. max_number_of_pages - 1`` from ``data_url`` and
        writes one row per record to ``xlsx_file_path``, replacing any existing
        file. Requires the cookies set by ``login()``.

        Raises:
            TypeError: If a session cookie is still ``None`` (no login yet).
        """
        payload = json.dumps({"cat::search": {}})
        cookie_header = '; '.join([
            'access_token=' + self.access_token,
            '_ga=' + self._ga,
            '_gid=' + self._gid,
            '_dc_gtm_UA-642192-18=1',
            '_hjIncludedInSessionSample=1',
        ])
        headers = {
            'content-type': 'application/json',
            'cookie': cookie_header,
        }

        if os.path.exists(self.xlsx_file_path):
            print('File already exists, removing it.')
            os.remove(self.xlsx_file_path)

        rows = []
        # The context manager closes the session even if a request raises.
        with requests.Session() as session:
            for page in range(self.max_number_of_pages):
                url = self.data_url + str(page) + '&pageSize=' + str(self.page_size)
                response = session.post(url, headers=headers, data=payload)
                body = json.loads(response.text)
                rows.extend(
                    self._employee_row(record)
                    for record in body.get('results') or []
                )

        # Build one frame from all rows. Explicit columns keep the header row
        # even when nothing was returned, where ``pd.concat([])`` would raise.
        frame = pd.DataFrame(rows, columns=list(self._FIELD_MAP))
        frame = frame.drop_duplicates(ignore_index=True)
        frame.to_excel(self.xlsx_file_path, index=False)

        print("Data appended to the Excel file successfully.")
        print('Data cleaned - removed duplicate data')

    @classmethod
    def _employee_row(cls, record):
        """Map one API record to a spreadsheet row keyed by column title."""
        return {column: record.get(key) for column, key in cls._FIELD_MAP.items()}

    def show_charts(self):
        """Show a designation bar chart and a birth-month pie chart.

        Reads the sheet at ``xlsx_file_path``. ``DOB`` values are parsed with
        the ``'%d %b'`` format; rows without a DOB are counted under an
        empty label.
        """
        df = pd.read_excel(self.xlsx_file_path, engine='openpyxl')
        if df.empty:
            # Plotting an empty Series raises inside pandas; bail out instead.
            print('No data available to chart.')
            return

        # Figure 1: number of employees per designation. The axes are passed
        # explicitly so the bars land on figure 1 whatever figures exist.
        designation_counts = df['Designation'].value_counts()
        bar_axes = plt.figure(1).gca()
        designation_counts.plot(kind='bar', ax=bar_axes)
        bar_axes.set_title('Designation chart')
        bar_axes.set_xlabel('Values')
        bar_axes.set_ylabel('Count')

        # Figure 2: share of birthdays per month.
        birth_months = pd.to_datetime(df['DOB'], format='%d %b').dt.month
        month_names = birth_months.map(
            lambda month: calendar.month_name[int(month)] if pd.notnull(month) else ''
        )
        month_counts = month_names.value_counts()

        plt.figure(2)
        plt.pie(month_counts, labels=month_counts.index, autopct='%1.1f%%')
        # NOTE(review): "discribution" looks like a typo for "distribution";
        # left unchanged because it is user-visible runtime text.
        plt.title('Month-wise Birthday discribution')

        # Show both charts
        plt.show()
| 181 | + |
| 182 | + |
def main():
    """Run the export pipeline: log in, download the data, then draw charts."""
    pipeline = DataExporter()
    # Steps run strictly in this order; each relies on the previous one.
    for step in (pipeline.login, pipeline.fetch_data, pipeline.show_charts):
        step()


# Only run the pipeline when executed as a script, not when imported.
if __name__ == '__main__':
    main()
0 commit comments