Scrape LinkedIn Jobs
import axios from 'axios';
import * as cheerio from 'cheerio';
import mongoose from 'mongoose';
import { getHeaders, getLogUrl, getProxies, Mutex } from '../../../commonUtils.js';
import { SPJobs } from '../models/jobModels.js';
import { logError, logInfo } from '../../../logger.js';
import JobConfig from '../../../jobConfig.js';
import { sendSlackNotification, sendStartNotification } from '../../../notificationUtils.js';
// Known country names (expand as needed). Built once at module load so the
// lookup does not re-create the list on every call.
const KNOWN_COUNTRIES = new Set([
  "India", "United States", "China", "Japan", "Germany", "United Kingdom", "France", "Brazil", "Italy", "Canada",
  "South Korea", "Russia", "Australia", "Spain", "Mexico", "Indonesia", "Netherlands", "Saudi Arabia", "Turkey", "Switzerland",
  "Argentina", "Sweden", "Poland", "Belgium", "Thailand", "Iran", "Austria", "Norway", "United Arab Emirates", "Israel", "Singapore",
  "Malaysia", "South Africa", "Philippines", "Denmark", "Nigeria", "Egypt", "Pakistan", "Vietnam", "Bangladesh", "Colombia", "Chile",
  "Finland", "Ireland", "Portugal", "Greece", "Czech Republic", "Iraq", "Qatar", "Peru", "New Zealand", "Kazakhstan", "Algeria", "Hungary",
  "Ukraine", "Morocco", "Ecuador", "Slovakia", "Kenya", "Dominican Republic", "Ethiopia", "Oman", "Costa Rica", "Panama", "Cuba", "Sri Lanka",
  "Bulgaria", "Myanmar", "Tanzania", "Ghana", "Uzbekistan", "Angola", "Croatia", "Lebanon", "Jordan", "Serbia", "Tunisia", "Bolivia", "Nepal",
  "Libya", "Uganda", "Cambodia", "El Salvador", "Paraguay", "Honduras", "Zimbabwe", "Zambia", "Senegal", "Cyprus", "Iceland", "Luxembourg", "Mongolia",
  "Namibia", "Botswana", "Mauritius", "Malta", "Brunei", "Bahamas", "Barbados", "Suriname", "Fiji", "Maldives", "Belize", "Bhutan", "Seychelles",
  "Samoa", "Vanuatu", "Tonga", "Palau", "Nauru", "Tuvalu",
]);

/**
 * Checks whether a piece of text is exactly one of the known country names.
 *
 * The comparison is case-sensitive and ignores surrounding whitespace.
 *
 * @param {string} text - Candidate country name.
 * @returns {boolean} True when the trimmed text is a known country; false
 *   otherwise, including for non-string input.
 */
function isCountry(text) {
  if (typeof text !== 'string') {
    return false;
  }
  return KNOWN_COUNTRIES.has(text.trim());
}
/**
 * Looks up the country of a free-text location through the Nominatim search API.
 *
 * Results are requested in English with structured address details, so the
 * country comes from the `address.country` field when present. The last
 * comma-separated part of `display_name` is used only as a fallback.
 *
 * @param {string} location - Free-text place description.
 * @returns {Promise<string|null>} The country name, or null when the input is
 *   blank, nothing matches, or the request fails (failures are logged).
 */
async function fetchCountryFromAPI(location) {
  // A blank query cannot match anything; skip the network round-trip.
  if (typeof location !== 'string' || location.trim() === '') {
    return null;
  }
  try {
    const response = await axios.get('https://nominatim.openstreetmap.org/search', {
      params: {
        q: location,
        format: 'json',
        limit: 1, // only the first hit is ever used
        addressdetails: 1, // adds the structured `address` object to each hit
        'accept-language': 'en', // otherwise names come back in the local language
      },
      timeout: 10000, // axios never times out by default; don't let a lookup stall the crawl
    });
    const [firstMatch] = Array.isArray(response.data) ? response.data : [];
    if (firstMatch) {
      const country = firstMatch.address?.country
        ?? firstMatch.display_name?.split(',').pop().trim();
      return country || null;
    }
  } catch (error) {
    console.error('Error fetching country from Nominatim API:', error);
  }
  return null; // No match, or the lookup failed
}
/**
 * Splits a location string such as "Pune, Maharashtra, India" into city and country.
 *
 * The last comma-separated part is taken as the country when `isCountry`
 * recognises it; otherwise the whole text is resolved through
 * `fetchCountryFromAPI`. Everything before the last comma becomes the city.
 *
 * @param {string} locationText - Raw location text scraped from the page.
 * @returns {Promise<{city: string, country: (string|null)}>} The parsed
 *   location; `country` is null when it cannot be determined.
 */
async function extractCountry(locationText) {
  const location = { city: '', country: '' };
  const text = typeof locationText === 'string' ? locationText.trim() : '';
  if (text === '') {
    // Nothing to resolve: report the same result a failed lookup gives,
    // without spending an API request on an empty query.
    location.country = null;
    return location;
  }

  // A text without commas yields a single part, so the city stays empty and
  // the whole text is the country candidate.
  const parts = text.split(',').map((part) => part.trim());
  const candidate = parts[parts.length - 1];
  location.city = parts.slice(0, -1).join(', ');
  location.country = isCountry(candidate) ? candidate : await fetchCountryFromAPI(text);
  return location;
}
/**
 * Moves a date back by whole months, clamping to the last day of the target
 * month when the original day does not exist there (e.g. 31 March -> February).
 */
function shiftMonthsBack(date, months) {
  const originalDay = date.getDate();
  date.setMonth(date.getMonth() - months);
  if (date.getDate() !== originalDay) {
    // The day overflowed into the following month; day 0 is the last day of
    // the month before the current one.
    date.setDate(0);
  }
}

/**
 * Converts a relative time such as "3 days ago" into an absolute Date.
 *
 * Accepted input is "<number> <unit> ago" (any letter case, any amount of
 * whitespace) where unit is second/minute/hour/day/week/month/year in singular
 * or plural form, plus the special phrase "just now". An unrecognised unit
 * leaves the reference time unchanged.
 *
 * @param {string} relativeTime - Relative time text.
 * @param {Date} [now=new Date()] - Reference time to count back from; not mutated.
 * @returns {Date} The computed point in time.
 * @throws {Error} When the text is not in the expected format or the amount
 *   is not a number.
 */
function convertRelativeTimeToDate(relativeTime, now = new Date()) {
  const formatError = 'Invalid relative time format. Expected format: "X unit ago"';
  const result = new Date(now.getTime());
  const text = typeof relativeTime === 'string' ? relativeTime.trim().toLowerCase() : '';

  if (text === 'just now') {
    return result;
  }

  const parts = text.split(/\s+/);
  if (parts.length !== 3 || parts[2] !== 'ago') {
    throw new Error(formatError);
  }

  const amount = Number.parseInt(parts[0], 10);
  if (Number.isNaN(amount)) {
    // Without this the setters below would silently produce an Invalid Date.
    throw new Error(formatError);
  }

  // Normalise "days" -> "day" so each unit needs only one case.
  const unit = parts[1].replace(/s$/, '');
  switch (unit) {
    case 'second':
      result.setSeconds(result.getSeconds() - amount);
      break;
    case 'minute':
      result.setMinutes(result.getMinutes() - amount);
      break;
    case 'hour':
      result.setHours(result.getHours() - amount);
      break;
    case 'day':
      result.setDate(result.getDate() - amount);
      break;
    case 'week':
      result.setDate(result.getDate() - amount * 7);
      break;
    case 'month':
      shiftMonthsBack(result, amount);
      break;
    case 'year':
      shiftMonthsBack(result, amount * 12);
      break;
    default:
      // Unknown unit: fall through and return the reference time as-is.
      break;
  }
  return result;
}
/**
 * Downloads a page through an authenticated HTTP proxy and parses it with cheerio.
 *
 * Redirects are not followed; any 2xx/3xx response is treated as success and
 * its body is parsed. After a successful request the function waits 15 seconds
 * before returning.
 *
 * @param {string} url - Page to download.
 * @param {object} headers - Request headers.
 * @param {{ip: string, port: (string|number)}} proxy - Proxy to route through;
 *   credentials come from the PROXY_USER / PROXY_PASSWORD environment variables.
 * @returns {Promise<object|undefined>} The cheerio document, or undefined when
 *   no proxy was supplied (no request is made in that case).
 * @throws {Error} Wrapping any request failure; the original error is attached
 *   as `cause` and its text stays in the message so callers can match on it.
 */
async function fetchPage(url, headers, proxy) {
  if (!proxy) {
    return undefined;
  }
  const { ip, port } = proxy;
  try {
    const response = await axios.get(url, {
      headers,
      proxy: {
        protocol: 'http',
        host: ip,
        port: Number(port),
        auth: {
          username: process.env.PROXY_USER,
          password: process.env.PROXY_PASSWORD,
        },
      },
      maxRedirects: 0,
      // Accept successful and redirect responses only
      validateStatus: (status) => status >= 200 && status < 400,
    });
    await delay(15000);
    return cheerio.load(response.data);
  } catch (error) {
    const reason = error?.message ?? String(error);
    // 429 responses are expected while crawling and would flood the console.
    if (!reason.includes("status code 429")) {
      console.error(`Error fetching URL: IP: ${ip}, ${url}`, reason);
    }
    throw new Error(`Error occurred Fetch: ${error}`, { cause: error });
  }
}
/**
 * Extracts job summaries from a parsed job-search results page.
 *
 * Each `div.base-search-card__info` element is one card; the job id is the
 * last colon-separated segment of the parent's `data-entity-urn` attribute.
 * Cards without an id are skipped because no usable job URL can be built for
 * them.
 *
 * @param {Function} $ - cheerio document for the results page.
 * @returns {Array<{title: string, company: string, location: string,
 *   date: string, jobUrl: string, jobDescription: string}>} One entry per card
 *   that has a job id; `date` is empty when the card has no list date.
 */
function parseJobCards($) {
  const jobs = [];
  $('div.base-search-card__info').each((_, element) => {
    const card = $(element);
    const entityUrn = card.parent().attr('data-entity-urn');
    const jobId = entityUrn ? entityUrn.split(':').pop() : '';
    if (!jobId) {
      // Would yield "https://www.linkedin.com/jobs/view//", which can never be fetched.
      return;
    }
    jobs.push({
      title: card.find('h3').text().trim(),
      company: card.find('a.hidden-nested-link').text().trim(),
      location: card.find('span.job-search-card__location').text().trim(),
      date: card.find('time.job-search-card__listdate').attr('datetime') || '',
      jobUrl: `https://www.linkedin.com/jobs/view/${jobId}/`,
      jobDescription: '',
    });
  });
  return jobs;
}
// Trims a string, mapping any falsy value (null, undefined, "") to an empty string.
const safeTrim = (text) => {
  if (!text) {
    return "";
  }
  return text.trim();
};
/**
 * Pulls the external application URL out of the page's `#applyUrl` element.
 * That element holds a quoted wrapper URL whose `url` query parameter is the
 * real target. Returns an empty string when any piece is missing.
 */
function extractApplyUrl($) {
  const embedded = $('#applyUrl').html();
  if (!embedded) {
    return "";
  }
  const wrapperUrl = embedded.match(/"(https?:\/\/.*?)"/)?.[1];
  if (!wrapperUrl) {
    return "";
  }
  const query = new URLSearchParams(wrapperUrl.split('?')[1]);
  // URLSearchParams.get() already percent-decodes the value. Decoding it a
  // second time corrupts targets that carry their own encoded parameters and
  // throws URIError on a literal "%".
  return query.get('url') || "";
}

/**
 * Fetches a job posting page and extracts its details.
 *
 * @param {string} url - Job page URL; the job id is read from the second-to-last
 *   "/"-separated segment, so the URL is expected to end with a slash.
 * @param {object} headers - Request headers passed through to `fetchPage`.
 * @param {object} proxy - Proxy passed through to `fetchPage`.
 * @returns {Promise<object|string>} The job details, or the string
 *   "Could not fetch job description" when `fetchPage` yields no document.
 * @throws {Error} Whatever `fetchPage` or `convertRelativeTimeToDate` throw,
 *   e.g. when the page has no parsable "posted ... ago" text.
 */
async function fetchJobDescription(url, headers, proxy) {
  const $ = await fetchPage(url, headers, proxy);
  if (!$) return "Could not fetch job description";

  const title = safeTrim($("h3.sub-nav-cta__header").text());

  // Address text is split into city and country
  const location = await extractCountry(safeTrim($('span.sub-nav-cta__meta-text').text()));

  const publishDate = convertRelativeTimeToDate(safeTrim($("span.posted-time-ago__text").text()));

  const companyLink = $('a.sub-nav-cta__optional-url');
  const rawIconUrl = $('img.artdeco-entity-image').attr('data-delayed-url') || "";
  const company = {
    name: safeTrim(companyLink.text()),
    url: companyLink.attr('href') || "",
    // Strip leftover "amp;" fragments of HTML-escaped ampersands
    icon: rawIconUrl.replace(/amp;/g, ''),
  };

  const urlSegments = url.split("/");
  const jobId = urlSegments[urlSegments.length - 2];

  const criteria = [];
  $('.description__job-criteria-item').each((_, element) => {
    const item = $(element);
    criteria.push({
      title: safeTrim(item.find('.description__job-criteria-subheader').text()),
      value: safeTrim(item.find('.description__job-criteria-text').text()),
    });
  });

  const applyUrl = extractApplyUrl($);
  const description = safeTrim($('div.show-more-less-html__markup').html());
  const salary = safeTrim($('div.salary.compensation__salary').text());

  return {
    jobId,
    title,
    description,
    jobUrl: url,
    applyUrl,
    publishDate,
    criteria,
    location,
    source: 'LinkedIn',
    company,
    salary,
  };
}
/**
 * Keeps only the jobs whose description mentions at least one configured tag.
 *
 * Matching is case-insensitive. Entries without a string `description`
 * (including non-object entries) are treated as non-matching instead of
 * raising, so one malformed entry cannot abort a whole batch.
 *
 * @param {Array<object>} jobs - Job detail records.
 * @param {{descriptionTags: string[]}} config - Crawl configuration.
 * @returns {Array<object>} The matching jobs, in their original order.
 */
function filterJobs(jobs, config) {
  // Lower-case the tags once rather than once per job.
  const wantedTags = config.descriptionTags.map((tag) => tag.toLowerCase());
  return jobs.filter((job) => {
    const description = typeof job?.description === 'string' ? job.description.toLowerCase() : '';
    return wantedTags.some((tag) => description.includes(tag));
  });
}
/**
 * Rough language guess: 'zh' when the text contains any character in the
 * U+4E00-U+9FFF range, 'en' otherwise. Swap in a real detection library if
 * more languages are needed.
 */
function detectLanguage(text) {
  const cjkMatch = text.match(/[\u4E00-\u9FFF]/);
  if (cjkMatch) {
    return 'zh';
  }
  return 'en';
}
/**
 * Returns a promise that resolves (with no value) after `ms` milliseconds.
 */
function delay(ms) {
  return new Promise((wake) => {
    setTimeout(wake, ms);
  });
}
/**
 * Runs one full LinkedIn job crawl.
 *
 * Phase 1 downloads the search-result pages for every configured query and
 * collects job cards. Phase 2 downloads each unique job page, filters the
 * details by description tag and upserts them in batches. Each phase runs
 * CONCURRENCY_SIZE workers over a shared URL queue and re-queues failed URLs
 * for up to RETRY_LIMIT extra rounds. A completion report is sent at the end.
 *
 * Any error escaping the crawl is logged and swallowed, so the returned
 * promise does not reject for crawl failures.
 *
 * @returns {Promise<void>}
 */
async function LinkedinJobs() {
  try {
    const startedAt = new Date();
    const startCrawlTime = startedAt.toISOString();
    JobConfig.initPlatform("Shopify");
    JobConfig.initSection("Jobs_linkedin");
    const jobId = JobConfig.initJobId();
    await sendStartNotification();
    logInfo(`Start Time: ${startCrawlTime}`);
    const proxies = await getProxies();
    const headers = {
      'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
      'accept-language': 'en-GB,en-US;q=0.9,en;q=0.8',
      'cache-control': 'max-age=0',
      'priority': 'u=0, i',
      'sec-ch-prefers-color-scheme': 'dark',
      'sec-ch-ua': '"Google Chrome";v="131", "Chromium";v="131", "Not_A Brand";v="24"',
      'sec-ch-ua-mobile': '?0',
      'sec-ch-ua-platform': '"Linux"',
      'sec-fetch-dest': 'document',
      'sec-fetch-mode': 'navigate',
      'sec-fetch-site': 'none',
      'sec-fetch-user': '?1',
      'upgrade-insecure-requests': '1',
      'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36'
    };
    // Only searchQueries, pagesToScrape and descriptionTags are read in this
    // function; the remaining keys are kept as they were.
    const config = {
      "OpenAI_API_KEY": "",
      "OpenAI_Model": "",
      "resume_path": "full local path to your resume in PDF format",
      "searchQueries": [
        { "keywords": "Shopify developer", "location": "", "f_WT": "" },
        { "keywords": "Shopify developer", "location": "India", "f_WT": "" },
        { "keywords": "Shopify developer", "location": "USA", "f_WT": "2" },
        { "keywords": "Shopify developer", "location": "Canada", "f_WT": "" },
        { "keywords": "Shopify developer", "location": "United Kingdom", "f_WT": "" },
        { "keywords": "Shopify developer", "location": "Germany", "f_WT": "" },
        { "keywords": "Shopify developer", "location": "Netherlands", "f_WT": "" },
        { "keywords": "Shopify developer", "location": "Singapore", "f_WT": "" },
        { "keywords": "Shopify developer", "location": "France", "f_WT": "" },
        { "keywords": "Shopify developer", "location": "South Africa", "f_WT": "" },
        { "keywords": "shopify jobs", "location": "", "f_WT": "" },
        { "keywords": "shopify jobs", "location": "India", "f_WT": "" },
        { "keywords": "shopify jobs", "location": "USA", "f_WT": "2" },
        { "keywords": "shopify jobs", "location": "Canada", "f_WT": "" },
        { "keywords": "shopify jobs", "location": "United Kingdom", "f_WT": "" },
        { "keywords": "shopify jobs", "location": "Germany", "f_WT": "" },
        { "keywords": "shopify jobs", "location": "Netherlands", "f_WT": "" },
        { "keywords": "shopify jobs", "location": "Singapore", "f_WT": "" },
        { "keywords": "shopify jobs", "location": "France", "f_WT": "" },
        { "keywords": "shopify jobs", "location": "South Africa", "f_WT": "" }
      ],
      "descriptionTags": ["Shopify"],
      "titleExclude": ["frontend", "fron end", "game"],
      "titleInclude": [],
      "companyExclude": ["ClickJobs.io"],
      "languages": ["en"],
      "timespan": "r84600",
      "jobs_tablename": "jobs",
      "filtered_jobs_tablename": "filtered_jobs",
      "db_path": "./data/my_database.db",
      "pagesToScrape": 30,
      "rounds": 1,
      "days_to_scrape": 10,
      "app_table": "jobs"
    };
    const CONCURRENCY_SIZE = 10;
    const RETRY_LIMIT = 9;
    const INSERT_BATCH_SIZE = 50;
    const appUrlsQueueMutex = new Mutex();
    const allJobs = [];
    const retryStatus = [];
    let allJobDetails = [];
    let appUrlsQueue = [];
    let appUrlsErrorQueue = [];
    let writeMutex = false;
    let totalCrawledJobs = 0;
    let retry = 0;

    // Phase 1 queue: every results page of every search query (25 cards per page).
    for (const query of config.searchQueries) {
      const keywords = encodeURIComponent(query.keywords);
      const location = encodeURIComponent(query.location);
      for (let i = 0; i < config.pagesToScrape; i++) {
        const url = `https://www.linkedin.com/jobs-guest/jobs/api/seeMoreJobPostings/search?keywords=${keywords}&location=${location}&start=${i * 25}`;
        appUrlsQueue.push(url);
      }
    }

    // Flushes the collected job details to the database once a full batch has
    // accumulated (or unconditionally when isEnd is set). The flag makes
    // overlapping calls a no-op while a flush is already in progress.
    const writeIntoDbSafely = async (config, isEnd = false) => {
      if (writeMutex) {
        return;
      }
      writeMutex = true;
      try {
        if (allJobDetails.length >= INSERT_BATCH_SIZE || isEnd) {
          const matchingJobs = filterJobs(allJobDetails, config);
          logInfo(`UnFiltered Jobs: ${allJobDetails.length}, Filtered Jobs: ${matchingJobs.length}`);
          allJobDetails = [];
          await writeIntoDb(matchingJobs);
        }
      } finally {
        writeMutex = false;
      }
    };

    // One worker: keeps taking URLs off the shared queue until it is drained.
    // With jobDetail false the URL is a results page; with jobDetail true it
    // is a single job page. Failed URLs go to the error queue for a later round.
    const processApp = async (proxyIterator, jobDetail) => {
      while (appUrlsQueue.length > 0) {
        await appUrlsQueueMutex.lock();
        const targetUrl = appUrlsQueue.shift();
        appUrlsQueueMutex.unlock();
        if (!targetUrl) {
          // Another worker emptied the queue while this one waited for the lock.
          return;
        }
        const proxy = proxyIterator.next().value;
        try {
          if (!jobDetail) {
            const page = await fetchPage(targetUrl, headers, proxy);
            if (page) {
              allJobs.push(...parseJobCards(page));
              logInfo(`Scraped page ${targetUrl}`);
            } else {
              console.log(`Failed to scrape page ${targetUrl}`);
            }
          } else {
            const jobDetails = await fetchJobDescription(targetUrl, headers, proxy);
            if (jobDetails === null || typeof jobDetails !== 'object') {
              // fetchJobDescription reports "no page" with a plain string; that
              // is a failed fetch, not a job record to count or store.
              throw new Error(`Could not fetch job description: ${targetUrl}`);
            }
            totalCrawledJobs++;
            await writeIntoDbSafely(config);
            allJobDetails.push(jobDetails);
          }
        } catch (error) {
          appUrlsErrorQueue.push(targetUrl);
          const reason = error?.message ?? String(error);
          // Throttling and auth-wall responses are expected; they are retried silently.
          const isExpected = reason.includes("status code 429") || reason.includes("auth wall");
          if (!isExpected) {
            // proxy is undefined when the proxy list is empty; never throw from here.
            logError(` processApp URL: ${targetUrl} , ${proxy?.ip} Error ::${error},`);
          }
        }
      }
    };

    // Endless round-robin over the proxy list.
    function* proxyGenerator(proxies) {
      let index = 0;
      while (true) {
        yield proxies[index];
        index = (index + 1) % proxies.length;
      }
    }

    // Drains the URL queue with CONCURRENCY_SIZE workers, then re-queues the
    // failed URLs and repeats until nothing fails or RETRY_LIMIT is reached.
    const runCrawlRounds = async (fetchDetails) => {
      while (true) {
        const proxyIterator = proxyGenerator(proxies);
        const workers = Array.from({ length: CONCURRENCY_SIZE }, () => processApp(proxyIterator, fetchDetails));
        await Promise.all(workers);
        await delay(60000);
        if (appUrlsErrorQueue.length === 0 || retry >= RETRY_LIMIT) {
          break;
        }
        retry++;
        retryStatus.push({ "Retry No": retry, "Total Retry Apps": appUrlsErrorQueue.length });
        logInfo(`Retrying URL Length: ${appUrlsErrorQueue.length}`);
        appUrlsQueue.push(...appUrlsErrorQueue);
        appUrlsErrorQueue = [];
      }
    };

    // Phase 1: results pages.
    await runCrawlRounds(false);
    if (appUrlsErrorQueue.length > 0) {
      // Results pages that never succeeded must not be re-queued as job pages
      // by the next phase's retry rounds.
      logInfo(`Search pages abandoned after retries: ${appUrlsErrorQueue.length}`);
      appUrlsErrorQueue = [];
    }

    // Phase 2: one request per unique job URL.
    const uniqueUrls = new Set(allJobs.map((job) => job.jobUrl));
    appUrlsQueue = Array.from(uniqueUrls);
    const duplicateUrls = allJobs.length - appUrlsQueue.length;
    const jobUrlsCount = appUrlsQueue.length;
    console.log(`Unique URLs: ${appUrlsQueue.length} Old Url Count: ${allJobs.length}`);
    retry = 0;
    await runCrawlRounds(true);
    await writeIntoDbSafely(config, allJobDetails.length > 0);

    const endedAt = new Date();
    const endCrawlTime = endedAt.toISOString();
    const totalTimeTaken = ((endedAt - startedAt) / (1000 * 60)).toFixed(2);
    logInfo(` Total time taken: ${totalTimeTaken} min for ${jobUrlsCount} URLs`);
    const crawlStatus = {
      jobId: jobId,
      retryCount: retry,
      duplicateUrls,
      jobUrlsCount,
      totalCrawledJobs,
      totalFailed: appUrlsErrorQueue.length,
      retryStatus: retryStatus,
      startCrawlTime: startCrawlTime,
      endCrawlTime: endCrawlTime,
      totalTimeTaken: totalTimeTaken
    };
    await handleCompletion(crawlStatus);
  } catch (error) {
    logError(`Linkedin_Jobs Crawler: ${error}`);
  }
}
/**
 * Builds the end-of-crawl report, posts it to Slack and then runs the
 * JobConfig cleanup.
 *
 * @param {object} crawlStatus - Crawl summary: jobId, totalTimeTaken,
 *   startCrawlTime, endCrawlTime, jobUrlsCount, totalCrawledJobs, totalFailed,
 *   retryCount and retryStatus.
 * @returns {Promise<void>}
 */
const handleCompletion = async (crawlStatus) => {
  const reportLines = [
    `*Crawler:* ${JobConfig.getPlatform()} ${JobConfig.getSection()} Crawling`,
    `*Status:* Finished (JOB_ID: ${crawlStatus.jobId})`,
    `*Crawl Time:* ${crawlStatus.totalTimeTaken} min (Start: ${crawlStatus.startCrawlTime} - End: ${crawlStatus.endCrawlTime})`,
    `*Total Jobs:* ${crawlStatus.jobUrlsCount}`,
    `*Jobs Crawled:* ${crawlStatus.totalCrawledJobs}`,
    `Jobs Failed: ${crawlStatus.totalFailed}`,
    `Retry Count: ${crawlStatus.retryCount} ${JSON.stringify(crawlStatus.retryStatus)}`,
    `Log: ${await getLogUrl()}`,
  ];
  const message = reportLines.join('\n');
  // The same text is passed for both arguments of the notification helper.
  await sendSlackNotification(message, message);
  JobConfig.cleanup();
};
/**
 * Upserts job records into the SPJobs collection in a single bulk operation.
 *
 * Records are matched on `jobId`; entries without one are skipped. Timing
 * information is logged after every call.
 *
 * @param {Array<object>} jobData - Job detail records.
 * @returns {Promise<void>}
 * @throws {Error} Wrapping any failure; the original error is logged and
 *   attached as `cause`.
 */
const writeIntoDb = async (jobData) => {
  try {
    const startTime = Date.now();
    const bulkOps = jobData
      .filter((data) => data?.jobId)
      .map((data) => ({
        updateOne: {
          filter: { jobId: data.jobId }, // Match based on jobId
          update: { $set: data }, // Update fields
          upsert: true, // Insert if not found
        },
      }));
    if (bulkOps.length > 0) {
      await SPJobs.bulkWrite(bulkOps);
      logInfo(`DB_STORE Jobs completed: ${bulkOps.length}`);
    }
    const duration = (Date.now() - startTime) / 1000;
    const total = jobData.length;
    // Guard the average so an empty batch logs 0 rather than NaN.
    const perJob = total > 0 ? duration / total : 0;
    logInfo(`LinkedinJobs Length:${total} Duration: ${duration} s, per Job Duration:${perJob}`, "dbstore");
  } catch (e) {
    logError(`Inserting in DB:: ${e}`, "dberror");
    throw new Error(`Inserting in DB: ${e}`, { cause: e });
  }
};
// await ScrapJobs()
export default LinkedinJobs;
Comments
Post a Comment