├── api
│   ├── __init__.py
│   └── routes.py
├── services
│   ├── __init__.py
│   ├── company_scraper.py
│   ├── candidate_scraper.py
│   └── scraping_utils.py
├── run.py
├── settings.py
├── requirements.txt
├── LICENSE
├── .gitignore
└── README.md
/api/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/services/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/run.py:
--------------------------------------------------------------------------------
import uvicorn
from api.routes import app

if __name__ == '__main__':
    uvicorn.run(app, host="127.0.0.1", port=8000, log_level="info")
--------------------------------------------------------------------------------
/settings.py:
--------------------------------------------------------------------------------
from dotenv import load_dotenv
from os import getenv

load_dotenv(override=True)

LINKEDIN_ACCESS_TOKEN = getenv('LINKEDIN_ACCESS_TOKEN')
LINKEDIN_ACCESS_TOKEN_EXP = getenv('LINKEDIN_ACCESS_TOKEN_EXP')
HEADLESS = getenv('HEADLESS')
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
annotated-types==0.6.0
anyio==4.3.0
attrs==23.2.0
certifi==2024.2.2
cffi==1.16.0
charset-normalizer==3.3.2
click==8.1.7
colorama==0.4.6
exceptiongroup==1.2.0
fastapi==0.110.0
h11==0.14.0
idna==3.6
outcome==1.3.0.post0
packaging==24.0
pycparser==2.21
pydantic==2.6.4
pydantic_core==2.16.3
PySocks==1.7.1
python-dotenv==1.0.1
requests==2.31.0
selenium==4.18.1
sniffio==1.3.1
sortedcontainers==2.4.0
starlette==0.36.3
trio==0.24.0
trio-websocket==0.11.1
typing_extensions==4.10.0
urllib3==2.2.1
uvicorn==0.28.0
webdriver-manager==4.0.1
wsproto==1.2.0
--------------------------------------------------------------------------------
/api/routes.py:
--------------------------------------------------------------------------------
from fastapi import FastAPI, HTTPException

from services.candidate_scraper import scrape_linkedin_profile
from services.company_scraper import scrape_linkedin_company

app = FastAPI()

@app.get("/profile-data/{linkedin_id}")
async def profile_data(linkedin_id: str):
    try:
        profile_infos = scrape_linkedin_profile(linkedin_id)
        return profile_infos
    except Exception as e:
        raise HTTPException(status_code=500, detail="Error fetching profile details")

@app.get("/company-data/{linkedin_id}")
async def company_data(linkedin_id: str):
    try:
        company_infos = scrape_linkedin_company(linkedin_id)
        return company_infos
    except Exception as e:
        raise HTTPException(status_code=500, detail="Error fetching company details")
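
# Example requests once the server from run.py is running locally
# (host and port as configured in run.py; <linkedin_id> is a placeholder):
#   GET http://127.0.0.1:8000/profile-data/<linkedin_id>
#   GET http://127.0.0.1:8000/company-data/<linkedin_id>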
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2024 Driss Briksine

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
env/
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
*.egg-info/
.installed.cfg
*.egg

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# IPython Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# dotenv
.env

# virtualenv
venv
ENV/

# Spyder project settings
.spyderproject

# Rope project settings
.ropeproject

tests
--------------------------------------------------------------------------------
/services/company_scraper.py:
--------------------------------------------------------------------------------
from selenium import webdriver
from time import sleep
from services.scraping_utils import options, service, search_for_company_name, search_for_company_industry, search_for_company_about, add_session_cookie


def scrape_linkedin_company(linkedin_id):
    """Scrape LinkedIn company data."""
    try:
        # Set up the Selenium WebDriver
        driver = webdriver.Chrome(service=service, options=options)

        # Add the LinkedIn session cookie to the driver
        add_session_cookie(driver)

        print(f'Scraping data for company id: {linkedin_id}')

        # LinkedIn URL for the company
        company_url = f"https://www.linkedin.com/company/{linkedin_id}/"

        # Navigate to the LinkedIn company page
        driver.get(company_url)

        if "/unavailable" in driver.current_url or "Page not found" in driver.page_source:
            driver.quit()
            print(f"Company profile for {linkedin_id} not found (404)")
            return {"error": f"Company profile for {linkedin_id} not found."}

        sleep(1)

        # Scrape the name, industry, and about section from the LinkedIn company page
        try:
            name = search_for_company_name(driver)
            if not name:
                driver.quit()
                print("Scraping failed: the session token is not set up or has expired")
                return {"error": "Your LinkedIn session token is not set up correctly or has expired"}
            industry = search_for_company_industry(driver)
            about = search_for_company_about(driver)
        except Exception as e:
            print(f"Error scraping details for company {linkedin_id}: {e}")
            return {"error": f"Error searching for details for company {linkedin_id}"}

        driver.quit()

        print(f"Finished fetching details for company {linkedin_id} successfully")
        return {
            "linkedin_id": linkedin_id,
            "name": name,
            "industry": industry,
            "about": about,
        }
    except Exception as e:
        print(f"Error fetching details for company {linkedin_id}: {e}")
        return {"error": f"Error fetching company details for {linkedin_id}"}
--------------------------------------------------------------------------------
/services/candidate_scraper.py:
--------------------------------------------------------------------------------
from selenium import webdriver
from time import sleep
from services.scraping_utils import options, service, search_for_candidate_name, search_for_candidate_headline, search_for_section, add_session_cookie


def scrape_linkedin_profile(linkedin_id):
    """Scrape LinkedIn profile data."""
    try:
        # Set up the Selenium WebDriver
        driver = webdriver.Chrome(service=service, options=options)

        # Add the LinkedIn session cookie to the driver
        add_session_cookie(driver)

        print(f'Scraping data for id: {linkedin_id}')

        # LinkedIn URL for the profile
        profile_url = f"https://www.linkedin.com/in/{linkedin_id}/"

        # Navigate to the LinkedIn profile
        driver.get(profile_url)

        if "/404" in driver.current_url or "Page not found" in driver.page_source:
            driver.quit()
            print(f"Profile for {linkedin_id} not found (404)")
            return {"error": f"Profile for {linkedin_id} not found."}

        sleep(1)

        # Scrape the name, headline, education, and experience from the LinkedIn profile
        try:
            name = search_for_candidate_name(driver)
            if not name:
                driver.quit()
                print("Scraping failed: the session token is not set up or has expired")
                return {"error": "Your LinkedIn session token is not set up correctly or has expired"}
            headline = search_for_candidate_headline(driver)
            education = search_for_section(driver, "Education")
            experience = search_for_section(driver, "Experience")
        except Exception as e:
            print(f"Error scraping details for {linkedin_id}: {e}")
            return {"error": f"Error searching for details for {linkedin_id}"}

        driver.quit()

        print(f"Finished fetching details for profile {linkedin_id} successfully")
        return {
            "linkedin_id": linkedin_id,
            "name": name,
            "headline": headline,
            "education": education,
            "experience": experience,
        }

    except Exception as e:
        print(f"Error fetching details for {linkedin_id}: {e}")
        return {"error": f"Error fetching profile details for {linkedin_id}"}
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# LinkedIn Scraper RESTful API

## Overview

This project is a LinkedIn scraper built with Selenium for web scraping and FastAPI for serving the scraped data via a RESTful API. It allows users to fetch detailed profile and company information from LinkedIn based on the provided LinkedIn ID.

## Installation

### Prerequisites

- Python 3.8+
- pip
- git

### Setup

1. **Clone the Repository**
   - Clone this repository to your local machine using `git clone https://github.com/drissbri/linkedin-scraper`.

2. **Create and Activate Virtual Environment**
   - Navigate to the project directory.
   - Create a virtual environment by running `python -m venv venv`.
   - Activate the virtual environment:
     - On Windows, run `venv\Scripts\activate`.
     - On macOS/Linux, run `source venv/bin/activate`.

3. **Install Dependencies**
   - Install the required packages by running `pip install -r requirements.txt`.

## Configuration

**Environment Variables**

- Create a `.env` file in the root of the project.
- Add the following environment variables:
  ```
  LINKEDIN_ACCESS_TOKEN="YourLinkedInAccessToken"
  LINKEDIN_ACCESS_TOKEN_EXP=AccessTokenExpiration
  HEADLESS=True
  ```
- Replace `YourLinkedInAccessToken` and `AccessTokenExpiration` with your actual LinkedIn access token and its expiration time.
- Set `HEADLESS` to `False` if you don't want the browser to run in headless mode.

### Obtaining LinkedIn Access Token with Cookie-Editor

To fetch data using the LinkedIn Scraper, you'll need a LinkedIn access token. The following steps will guide you through obtaining this token using the Cookie-Editor browser addon:

#### Step 1: Install Cookie-Editor

- **Install Cookie-Editor**: Add the [Cookie-Editor](https://cookie-editor.com/) addon to your browser. It's available for [Chrome](https://chromewebstore.google.com/detail/cookie-editor/hlkenndednhfkekhgcdicdfddnkalmdm), [Firefox](https://addons.mozilla.org/en-US/firefox/addon/cookie-editor), and other popular browsers. Visit the addon store for your browser and search for "Cookie-Editor" to install.

#### Step 2: Access LinkedIn

- **Log into LinkedIn**: Open your browser and log into your LinkedIn account as you normally would.

#### Step 3: Open Cookie-Editor

- **Launch Cookie-Editor**: Once logged in, click on the Cookie-Editor icon in your browser's toolbar to open the addon.

#### Step 4: Find the Access Token

- **Search for Access Token**: In the Cookie-Editor interface, look for a cookie named `li_at` or similar. This cookie contains your LinkedIn access token.
  ![LinkedIn Cookie-Editor](https://i.imgur.com/JzdNF3n.png "LinkedIn access token")
- **Copy the Token**: Click on the `li_at` cookie to view its details, and copy the value. This is your LinkedIn access token.
- **Copy the Expiration Time**: Convert the expiration time into a numeric format (a Unix timestamp such as `1743212774.0`) and copy the value. This is your LinkedIn access token's expiration time; a conversion snippet is shown below.
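
If you need to turn the human-readable expiration date shown by Cookie-Editor into a numeric timestamp, a small Python snippet like the following will do it (a sketch — the date value and format below are placeholders; adjust them to match what the addon displays for you):

```python
from datetime import datetime, timezone

# Placeholder expiration date; use the value shown by Cookie-Editor.
expiration = "2025-03-29 02:26:14"

# Parse the date (treated as UTC here) and convert it to a Unix timestamp.
timestamp = datetime.strptime(expiration, "%Y-%m-%d %H:%M:%S").replace(tzinfo=timezone.utc).timestamp()
print(timestamp)  # a float suitable for LINKEDIN_ACCESS_TOKEN_EXP
```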

### Notes

- **Token Validity**: LinkedIn access tokens are temporary. Check the token's validity periodically and update it as needed.
- **Privacy and Security**: Handle your access token securely, as it grants access to your LinkedIn data. Do not share your token publicly.

> [!IMPORTANT]
> **If you continue to encounter issues with the access token being undefined or expired even after following these steps, please replace the values directly in the `settings.py` file.**

## Usage

To start the server, run the following command in the root directory of the project:

```shell
python run.py
```

This command starts the Uvicorn server and makes the API accessible on `http://localhost:8000` by default.

## API Endpoints

The LinkedIn Scraper offers endpoints for retrieving detailed profile and company information from LinkedIn. Here's what you can expect from each:

### Profile Data

- **GET** `/profile-data/{linkedin_id}`
  - Fetches profile information for the specified LinkedIn ID.
  - **Path Parameters:**
    - `linkedin_id`: The unique LinkedIn ID of the profile.
  - **Response:** JSON object containing the profile information in the format below.
  - **Response Format:**
    ```json
    {
      "linkedin_id": "string",
      "name": "string",
      "headline": "string",
      "education": {
        "positions": ["string"],
        "institutions": ["string"],
        "dates": ["string"]
      },
      "experience": {
        "positions": ["string"],
        "institutions": ["string"],
        "dates": ["string"]
      }
    }
    ```
  - The `education` and `experience` fields are objects containing arrays of strings for positions, institutions, and dates, providing a concise summary of the individual's educational background and work history.

### Company Data

- **GET** `/company-data/{linkedin_id}`
  - Fetches company information based on the given LinkedIn ID.
  - **Path Parameters:**
    - `linkedin_id`: The unique LinkedIn ID of the company.
  - **Response:** JSON object containing the company information as structured below.
  - **Response Format:**
    ```json
    {
      "linkedin_id": "string",
      "name": "string",
      "industry": "string",
      "about": "string"
    }
    ```
  - This format covers the company's LinkedIn ID, name, industry sector, and a brief description of the company in the `about` field.

## Error Handling

Error handling is consistent across endpoints, aiming to provide meaningful feedback in case of failures:

- **Example Error Response:**

  ```json
  {
    "detail": "Error fetching profile details"
  }
  ```
  This response is returned with an HTTP status code of 500, indicating a server-side error during data fetching, along with a detail message explaining the error.

## Notes

- Ensure that the LinkedIn access token is valid and not expired to avoid authentication errors.

## Project Structure

- `api/routes.py` - Contains the API routes.
- `services/candidate_scraper.py` - Contains the scraping function for individual profiles (see the sketch below).
- `services/company_scraper.py` - Contains the scraping function for company profiles.
- `services/scraping_utils.py` - Contains the helper functions and options used in the scraping process for both profiles and companies.
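
The route handlers in `api/routes.py` are thin wrappers around these service functions, so the scrapers can also be used directly from Python without starting the API server. A minimal sketch (assuming the environment variables from the Configuration section are set; `some-profile-id` is a placeholder):

```python
from services.candidate_scraper import scrape_linkedin_profile

# Returns a dict with linkedin_id, name, headline, education, and experience,
# or a dict containing an "error" key if scraping fails.
data = scrape_linkedin_profile("some-profile-id")
print(data)
```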

## Contributing

I welcome contributions from the community and I'm excited to see how you can improve and extend this LinkedIn Scraper project! If you're looking to contribute, here are a few ways you can do so:

### Reporting Bugs

- **Submit an Issue**: If you find a bug or encounter an issue, please create an issue on the GitHub repository. Provide as much detail as possible, including steps to reproduce the issue, the expected outcome, and the actual outcome.

### Feature Requests

- **Request a Feature**: Have an idea for a new feature or an enhancement to existing functionality? Submit a feature request through the issue tracker. Describe the feature, its potential benefits, and how it might work.

### Submitting Changes

- **Fork the Repository**: Start by forking the repository to your GitHub account.
- **Create a Branch**: Create a new branch for your changes. Use a clear and descriptive name for your branch, such as `fix-issue-1` or `add-new-feature`.
- **Make Your Changes**: Implement your changes, adhering to the existing coding style as much as possible.
- **Write Tests**: If you're adding new functionality or fixing a bug, please add tests to cover your changes.
- **Document Your Changes**: Update the README or documentation with any changes required by your contribution.
- **Submit a Pull Request (PR)**: Once your changes are complete, submit a pull request to the main branch of the original repository. Include a clear description of your changes and any other relevant information.

### Code Review Process

- **Review & Feedback**: After submitting a PR, the project maintainers will review your changes. Be open to feedback and ready to make adjustments as needed.
- **Approval & Merge**: If your contribution is approved, the project maintainers will merge your changes into the main branch.

### General Guidelines

- Ensure your code follows the project's coding conventions and best practices.
- Keep your commits small and focused; it makes the review process easier.
- Update any documentation that your changes might affect.

I appreciate your interest in contributing to the LinkedIn Scraper project! By participating, you agree to abide by the code of conduct and collaboration guidelines. Let's build something great together!

--------------------------------------------------------------------------------
/services/scraping_utils.py:
--------------------------------------------------------------------------------
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

from settings import LINKEDIN_ACCESS_TOKEN, LINKEDIN_ACCESS_TOKEN_EXP, HEADLESS

# Setting up the browser options
options = Options()
# Run headless unless HEADLESS is explicitly set to "False"
if HEADLESS != "False":
    options.add_argument("--headless=new")
options.add_argument('--ignore-ssl-errors=yes')
options.add_argument('--ignore-certificate-errors=yes')
options.add_argument("--log-level=3")

# Setting up the ChromeDriver service ('nul' is the Windows null device)
service = Service(ChromeDriverManager().install(), log_output='nul')

def find_by_xpath_or_None(driver, *xpaths):
    """Return the text of the first element matching one of the given XPaths, or None."""
    for xpath in xpaths:
        try:
            return driver.find_element(By.XPATH, xpath).text
        except NoSuchElementException:
            #print(f"Element not found : {xpath}")
            continue
    return None


def search_for_candidate_name(driver):
    """Search for the profile's name in the page."""
    try:
        name = find_by_xpath_or_None(driver, '/html/body/div[5]/div[3]/div/div/div[2]/div/div/main/section[1]/div[2]/div[2]/div[1]/div[1]/span/a/h1', '/html/body/div[4]/div[3]/div/div/div[2]/div/div/main/section[1]/div[2]/div[2]/div[1]/div[1]/span/a/h1')
        return name
    except Exception as e:
        print(f"Error finding name: {e}")
        return None


def search_for_candidate_headline(driver):
    """Search for the profile's headline in the page."""
    try:
        headline = find_by_xpath_or_None(driver, '/html/body/div[5]/div[3]/div/div/div[2]/div/div/main/section[1]/div[2]/div[2]/div[1]/div[2]', '/html/body/div[4]/div[3]/div/div/div[2]/div/div/main/section[1]/div[2]/div[2]/div[1]/div[2]')
        return headline
    except Exception as e:
        print(f"Error finding headline: {e}")
        return None


def search_for_section(driver, section_name, min_index=2, max_index=8):
    """Search for a section's content by section name in the page."""
    try:
        # Initialize variables
        sectionIndex = min_index
        found_elements = {
            'positions': [],
            'institutions': [],
            'dates': []
        }

        # Function to add found elements to the dictionary
        def add_elements(position, institution, date):
            if position: found_elements['positions'].append(position)
            if institution: found_elements['institutions'].append(institution)
            if date: found_elements['dates'].append(date)

        # Loop through sections until the "section_name" section is found
        while sectionIndex <= max_index:
            # Check if the section title matches "section_name"
            section_title = find_by_xpath_or_None(driver, f'/html/body/div[5]/div[3]/div/div/div[2]/div/div/main/section[{sectionIndex}]/div[2]/div/div/div/h2/span[1]')
            if section_title == section_name:
                elementIndex = 1
                # Experience
                if section_name == "Experience":
                    while True:
                        target_element_position = find_by_xpath_or_None(driver, f'/html/body/div[5]/div[3]/div/div/div[2]/div/div/main/section[{sectionIndex}]/div[3]/ul/li[{elementIndex}]/div/div[2]/div[1]/div/div/div/div/div/span[1]', f'/html/body/div[5]/div[3]/div/div/div[2]/div/div/main/section[{sectionIndex}]/div[3]/ul/li[{elementIndex}]/div/div[2]/div[2]/ul/li[1]/div/div[2]/div/a/div/div/div/div/span[1]', f'/html/body/div[5]/div[3]/div/div/div[2]/div/div/main/section[{sectionIndex}]/div[3]/ul/li[{elementIndex}]/div/div[2]/div[2]/ul/li[1]/div/div[2]/div/a/div/div/div/div/div/span[1]', f'/html/body/div[5]/div[3]/div/div/div[2]/div/div/main/section[{sectionIndex}]/div[3]/ul/li[{elementIndex}]/div/div[2]/div/div/span[1]/span[1]')
                        target_element_institution = find_by_xpath_or_None(driver, f'/html/body/div[5]/div[3]/div/div/div[2]/div/div/main/section[{sectionIndex}]/div[3]/ul/li[{elementIndex}]/div/div[2]/div[1]/div/span[1]/span[1]', f'/html/body/div[5]/div[3]/div/div/div[2]/div/div/main/section[{sectionIndex}]/div[3]/ul/li[{elementIndex}]/div/div[2]/div[1]/a/div/div/div/div/span[1]', f'/html/body/div[5]/div[3]/div/div/div[2]/div/div/main/section[{sectionIndex}]/div[3]/ul/li[{elementIndex}]/div/div[2]/div[1]/a/div/div/div/div/span[1]')
                        target_element_date = find_by_xpath_or_None(driver, f'/html/body/div[5]/div[3]/div/div/div[2]/div/div/main/section[{sectionIndex}]/div[3]/ul/li[{elementIndex}]/div/div[2]/div[1]/div/span[2]/span[1]', f'/html/body/div[5]/div[3]/div/div/div[2]/div/div/main/section[{sectionIndex}]/div[3]/ul/li[{elementIndex}]/div/div[2]/div[1]/a/span[1]/span[1]', f'/html/body/div[5]/div[3]/div/div/div[2]/div/div/main/section[{sectionIndex}]/div[3]/ul/li[{elementIndex}]/div/div[2]/div/div/span[2]/span[1]')
                        if not target_element_position:
                            break

                        add_elements(target_element_position, target_element_institution, target_element_date)
                        elementIndex += 1
                # Education
                if section_name == "Education":
                    while True:
                        target_element_position = find_by_xpath_or_None(driver, f'/html/body/div[5]/div[3]/div/div/div[2]/div/div/main/section[{sectionIndex}]/div[3]/ul/li[{elementIndex}]/div/div[2]/div/a/span[1]/span[1]', f'/html/body/div[5]/div[3]/div/div/div[2]/div/div/main/section[{sectionIndex}]/div[3]/ul/li[{elementIndex}]/div/div[2]/div/a/span[1]/span[1]')
                        target_element_institution = find_by_xpath_or_None(driver, f'/html/body/div[5]/div[3]/div/div/div[2]/div/div/main/section[{sectionIndex}]/div[3]/ul/li[{elementIndex}]/div/div[2]/div/a/div/div/div/div/span[1]', f'/html/body/div[5]/div[3]/div/div/div[2]/div/div/main/section[{sectionIndex}]/div[3]/ul/li[{elementIndex}]/div/div[2]/div/a/div/div/div/div/span[1]')
                        target_element_date = find_by_xpath_or_None(driver, f'/html/body/div[5]/div[3]/div/div/div[2]/div/div/main/section[{sectionIndex}]/div[3]/ul/li[{elementIndex}]/div/div[2]/div/a/span[2]/span[1]', f'/html/body/div[5]/div[3]/div/div/div[2]/div/div/main/section[{sectionIndex}]/div[3]/ul/li[{elementIndex}]/div/div[2]/div/a/span[2]/span[1]')

                        if not target_element_position:
                            break

                        add_elements(target_element_position, target_element_institution, target_element_date)
                        elementIndex += 1
                break
            sectionIndex += 1  # Move to the next section

        return found_elements
    except Exception as e:
        print(f"Error finding section: {e}")
        return None


def search_for_company_name(driver):
    """Search for the company's name in the page."""
    try:
        company_name = find_by_xpath_or_None(driver, '/html/body/div[5]/div[3]/div/div[2]/div/div[2]/main/div[1]/section/div/div[2]/div[2]/div[1]/div[2]/div/h1')
        return company_name
    except Exception as e:
        print(f"Error finding company name: {e}")
        return None


def search_for_company_industry(driver):
    """Search for the company's industry in the page."""
    try:
        company_industry = find_by_xpath_or_None(driver, '/html/body/div[4]/div[3]/div/div[2]/div/div[2]/main/div[1]/section/div/div[2]/div[2]/div[1]/div[2]/div/div/div[1]', '/html/body/div[5]/div[3]/div/div[2]/div/div[2]/main/div[1]/section/div/div[2]/div[2]/div[1]/div[2]/div/div/div[1]')
        return company_industry
    except Exception as e:
        print(f"Error finding company industry: {e}")
        return None


def search_for_company_about(driver):
    """Search for the company's about section in the page."""
    try:
        # Expand the full about text before reading it
        more_button = driver.find_element(By.XPATH, '/html/body/div[5]/div[3]/div/div[2]/div/div[2]/main/div[2]/div/div[1]/section/div/div/div[1]/div/span[3]/span/a')
        more_button.click()
        company_about = find_by_xpath_or_None(driver, '/html/body/div[5]/div[3]/div/div[2]/div/div[2]/main/div[2]/div/div[1]/section/div/div/div[1]/div/span[1]')
        return company_about
    except Exception as e:
        print(f"Error finding company about: {e}")
        return None


def add_session_cookie(driver):
    """Build the LinkedIn session cookie from settings and add it to the driver."""
    cookie = {
        "domain": ".www.linkedin.com",
        "name": "li_at",
        "value": LINKEDIN_ACCESS_TOKEN,
        "path": "/",
        "secure": True,
        "httpOnly": True,
        # The expiration is a numeric timestamp (see the README); environment variables are read as strings, so cast it
        "expirationDate": float(LINKEDIN_ACCESS_TOKEN_EXP) if LINKEDIN_ACCESS_TOKEN_EXP else None,
    }
    # Add the cookie to the driver
    try:
        driver.get("https://www.linkedin.com")
        driver.add_cookie(cookie)
    except Exception as e:
        print(f"Error adding cookie to driver: {e}")
--------------------------------------------------------------------------------