I've been learning Node.js crawling lately and picked up the request module, so I wanted to build a crawler project of my own. After some research I settled on indeed as the target site: crawl indeed's job listings and build my own job search engine on top of them. The site is already online — the features are still fairly basic, but here is the link anyway, job search engine, as proof that the crawler is good for something. Below is a walkthrough of how the whole crawler works.

## Determining the entry page

As everyone knows, a crawler needs an entry page; starting from it, the crawler keeps following links until the whole site has been covered. This very first step is where I hit a snag. Normally you would pick the home page and the list pages as entry points, but indeed restricts its list pages: you cannot page through a complete result list, at most the first 100 pages. That didn't stop me, though — indeed has a Browse Jobs page, and from it you can reach every listing indexed both by region and by category. Here is the parsing code for that page.

```js
start: async (page) => {
  const host = URL.parse(page.url).hostname;
  const tasks = [];
  try {
    const $ = cheerio.load(iconv.decode(page.con, 'utf-8'), { decodeEntities: false });
    $('#states > tbody > tr > td > a').each((i, ele) => {
      const url = URL.resolve(page.url, $(ele).attr('href'));
      tasks.push({ _id: md5(url), type: 'city', host, url, done: 0, name: $(ele).text() });
    });
    $('#categories > tbody > tr > td > a').each((i, ele) => {
      const url = URL.resolve(page.url, $(ele).attr('href'));
      tasks.push({ _id: md5(url), type: 'category', host, url, done: 0, name: $(ele).text() });
    });
    const res = await global.com.task.insertMany(tasks, { ordered: false }).catch(() => {});
    res && console.log(`${host}-start insert ${res.insertedCount} from ${tasks.length} tasks`);
    return 1;
  } catch (err) {
    console.error(`${host}-start parse ${page.url} ${err}`);
    return 0;
  }
}
```
The HTML is parsed with cheerio, and the by-region and by-category links are inserted into the database.

## Crawler architecture

A quick outline of my architecture. MongoDB is the database. Every page waiting to be crawled is stored as one page record with fields such as _id, url, done, type and host; _id is generated with md5(url) to avoid duplicates. Every type has a corresponding HTML parsing method, and these parsers are where most of the business logic lives — the code shown above is one example.

Pages are downloaded with the request module, wrapped thinly so the callback becomes a promise and can be called with async/await. The code is below.

```js
const req = require('request');

const request = req.defaults({
  headers: {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'
  },
  timeout: 30000,
  encoding: null
});

const fetch = (url) => new Promise((resolve) => {
  console.log(`down ${url} started`);
  request(encodeURI(url), (err, res, body) => {
    if (res && res.statusCode === 200) {
      console.log(`down ${url} 200`);
      resolve(body);
    } else {
      console.error(`down ${url} ${res && res.statusCode} ${err}`);
      if (res && res.statusCode) {
        resolve(res.statusCode);
      } else {
        // ESOCKETTIMEDOUT and other timeout errors resolve to 600
        resolve(600);
      }
    }
  });
});
```
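A quick usage sketch — my own illustration, not part of the original project code — of how a caller can tell the three outcomes apart: a Buffer on success, an HTTP status code on failure, or 600 on a timeout. The `crawl` name is hypothetical.

```js
// Hypothetical caller of the fetch() wrapper above.
const crawl = async (url) => {
  const result = await fetch(url);
  if (Buffer.isBuffer(result)) {
    // 200 OK: raw body bytes, to be decoded later with iconv-lite
    return result;
  }
  // Otherwise result is a number: an HTTP status code, or 600 for timeouts
  console.error(`fetch failed for ${url} with code ${result}`);
  return null;
};
```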
There is some simple anti-anti-crawling handling: the user-agent is set to a common desktop browser UA and the timeout is 30 seconds. `encoding: null` makes request return the raw buffer instead of a decoded string; the benefit is that whether a page is encoded in gbk or utf-8, you only need to specify the encoding when parsing the HTML. If you set `encoding: 'utf-8'` here instead, gbk pages would come back garbled.

request is callback-based by default, so it is wrapped in a promise: on success it resolves with the page body as a buffer, on failure it resolves with the HTTP status code, and on timeout it resolves with 600. Anyone familiar with Node.js should find this easy to follow.

## The complete parsing code

```js
const URL = require('url');
const md5 = require('md5');
const cheerio = require('cheerio');
const iconv = require('iconv-lite');

const json = (data) => {
  let res;
  try {
    res = JSON.parse(data);
  } catch (err) {
    console.error(err);
  }
  return res;
};

const rules = [
  /\/jobs\?q=.*&sort=date&start=\d+/,
  /\/jobs\?q=&l=.*&sort=date&start=\d+/
];

const fns = {
  // start: parse the Browse Jobs page and queue city/category tasks
  start: async (page) => {
    const host = URL.parse(page.url).hostname;
    const tasks = [];
    try {
      const $ = cheerio.load(iconv.decode(page.con, 'utf-8'), { decodeEntities: false });
      $('#states > tbody > tr > td > a').each((i, ele) => {
        const url = URL.resolve(page.url, $(ele).attr('href'));
        tasks.push({ _id: md5(url), type: 'city', host, url, done: 0, name: $(ele).text() });
      });
      $('#categories > tbody > tr > td > a').each((i, ele) => {
        const url = URL.resolve(page.url, $(ele).attr('href'));
        tasks.push({ _id: md5(url), type: 'category', host, url, done: 0, name: $(ele).text() });
      });
      const res = await global.com.task.insertMany(tasks, { ordered: false }).catch(() => {});
      res && console.log(`${host}-start insert ${res.insertedCount} from ${tasks.length} tasks`);
      return 1;
    } catch (err) {
      console.error(`${host}-start parse ${page.url} ${err}`);
      return 0;
    }
  },
  // city: parse a state page and queue one 'search' task per city
  city: async (page) => {
    const host = URL.parse(page.url).hostname;
    const tasks = [];
    const cities = [];
    try {
      const $ = cheerio.load(iconv.decode(page.con, 'utf-8'), { decodeEntities: false });
      $('#cities > tbody > tr > td > p.city > a').each((i, ele) => {
        // e.g. https://www.indeed.com/l-Charlotte,-NC-jobs.html
        let tmp = $(ele).attr('href').match(/l-(?<loc>.*)-jobs.html/u);
        if (!tmp) {
          tmp = $(ele).attr('href').match(/l=(?<loc>.*)/u);
        }
        const { loc } = tmp.groups;
        const url = `https://www.indeed.com/jobs?l=${decodeURIComponent(loc)}&sort=date`;
        tasks.push({ _id: md5(url), type: 'search', host, url, done: 0 });
        cities.push({ _id: `${$(ele).text()}_${page.name}`, parent: page.name, name: $(ele).text(), url });
      });
      let res = await global.com.city.insertMany(cities, { ordered: false }).catch(() => {});
      res && console.log(`${host}-city insert ${res.insertedCount} from ${cities.length} cities`);
      res = await global.com.task.insertMany(tasks, { ordered: false }).catch(() => {});
      res && console.log(`${host}-city insert ${res.insertedCount} from ${tasks.length} tasks`);
      return 1;
    } catch (err) {
      console.error(`${host}-city parse ${page.url} ${err}`);
      return 0;
    }
  },
  // category: parse a job-category page and queue one 'search' task per job title
  category: async (page) => {
    const host = URL.parse(page.url).hostname;
    const tasks = [];
    const categories = [];
    try {
      const $ = cheerio.load(iconv.decode(page.con, 'utf-8'), { decodeEntities: false });
      $('#titles > tbody > tr > td > p.job > a').each((i, ele) => {
        const { query } = $(ele).attr('href').match(/q-(?<query>.*)-jobs.html/u).groups;
        const url = `https://www.indeed.com/jobs?q=${decodeURIComponent(query)}&sort=date`;
        tasks.push({ _id: md5(url), type: 'search', host, url, done: 0 });
        categories.push({ _id: `${$(ele).text()}_${page.name}`, parent: page.name, name: $(ele).text(), url });
      });
      let res = await global.com.category.insertMany(categories, { ordered: false }).catch(() => {});
      res && console.log(`${host}-category insert ${res.insertedCount} from ${categories.length} categories`);
      res = await global.com.task.insertMany(tasks, { ordered: false }).catch(() => {});
      res && console.log(`${host}-category insert ${res.insertedCount} from ${tasks.length} tasks`);
      return 1;
    } catch (err) {
      console.error(`${host}-category parse ${page.url} ${err}`);
      return 0;
    }
  },
  // search: parse a result-list page, pulling job keys out of the inline jobmap script
  search: async (page) => {
    const host = URL.parse(page.url).hostname;
    const tasks = [];
    const durls = [];
    try {
      const con = iconv.decode(page.con, 'utf-8');
      const $ = cheerio.load(con, { decodeEntities: false });
      const list = con.match(/jobmap\[\d+\]= {.*}/g);
      const jobmap = [];
      if (list) {
        // eslint-disable-next-line no-eval
        list.map((item) => eval(item));
      }
      for (const item of jobmap) {
        const cmplink = URL.resolve(page.url, item.cmplnk);
        const { query } = URL.parse(cmplink, true);
        let name;
        if (query.q) {
          // eslint-disable-next-line prefer-destructuring
          name = query.q.split(' #')[0].split('#')[0];
        } else {
          const tmp = cmplink.match(/q-(?<text>.*)-jobs.html/u);
          if (!tmp) {
            // eslint-disable-next-line no-continue
            continue;
          }
          const { text } = tmp.groups;
          // eslint-disable-next-line prefer-destructuring
          name = text.replace(/-/g, ' ').split(' #')[0];
        }
        const surl = `https://www.indeed.com/cmp/_cs/cmpauto?q=${name}&n=10&returnlogourls=1&returncmppageurls=1&caret=8`;
        const burl = `https://www.indeed.com/viewjob?jk=${item.jk}&from=vjs&vjs=1`;
        const durl = `https://www.indeed.com/rpc/jobdescs?jks=${item.jk}`;
        tasks.push({ _id: md5(surl), type: 'suggest', host, url: surl, done: 0 });
        tasks.push({ _id: md5(burl), type: 'brief', host, url: burl, done: 0 });
        durls.push({ _id: md5(durl), type: 'detail', host, url: durl, done: 0 });
      }
      $('a[href]').each((i, ele) => {
        const tmp = URL.resolve(page.url, $(ele).attr('href'));
        const [url] = tmp.split('#');
        const { path, hostname } = URL.parse(url);
        for (const rule of rules) {
          if (rule.test(path)) {
            if (hostname == host) {
              // tasks.push({ _id: md5(url), type: 'list', host, url: decodeURI(url), done: 0 });
            }
            break;
          }
        }
      });
      let res = await global.com.task.insertMany(tasks, { ordered: false }).catch(() => {});
      res && console.log(`${host}-search insert ${res.insertedCount} from ${tasks.length} tasks`);
      res = await global.com.task.insertMany(durls, { ordered: false }).catch(() => {});
      res && console.log(`${host}-search insert ${res.insertedCount} from ${durls.length} tasks`);
      return 1;
    } catch (err) {
      console.error(`${host}-search parse ${page.url} ${err}`);
      return 0;
    }
  },
  // suggest: parse the company autocomplete response and queue company/jobs tasks
  suggest: async (page) => {
    const host = URL.parse(page.url).hostname;
    const tasks = [];
    const companies = [];
    try {
      const con = page.con.toString('utf-8');
      const data = json(con);
      for (const item of data) {
        const id = item.overviewUrl.replace('/cmp/', '');
        const cmpurl = `https://www.indeed.com/cmp/${id}`;
        const joburl = `https://www.indeed.com/cmp/${id}/jobs?clearPrefilter=1`;
        tasks.push({ _id: md5(cmpurl), type: 'company', host, url: cmpurl, done: 0 });
        tasks.push({ _id: md5(joburl), type: 'jobs', host, url: joburl, done: 0 });
        companies.push({ _id: id, name: item.name, url: cmpurl });
      }
      let res = await global.com.company.insertMany(companies, { ordered: false }).catch(() => {});
      res && console.log(`${host}-suggest insert ${res.insertedCount} from ${companies.length} companies`);
      res = await global.com.task.insertMany(tasks, { ordered: false }).catch(() => {});
      res && console.log(`${host}-suggest insert ${res.insertedCount} from ${tasks.length} tasks`);
      return 1;
    } catch (err) {
      console.error(`${host}-suggest parse ${page.url} ${err}`);
      return 0;
    }
  },
  // list: () => {},
  // jobs: parse a company jobs page by extracting the embedded window._initialData JSON
  jobs: async (page) => {
    const host = URL.parse(page.url).hostname;
    const tasks = [];
    const durls = [];
    try {
      const con = iconv.decode(page.con, 'utf-8');
      const tmp = con.match(/window._initialData=(?<text>.*);<\/script><script>window._sentryData/u);
      let data;
      if (tmp) {
        const { text } = tmp.groups;
        data = json(text);
        if (data.jobList && data.jobList.pagination && data.jobList.pagination.paginationLinks) {
          for (const item of data.jobList.pagination.paginationLinks) {
            // eslint-disable-next-line max-depth
            if (item.href) {
              item.href = item.href.replace(/\\u002F/g, '/');
              const url = URL.resolve(page.url, decodeURI(item.href));
              tasks.push({ _id: md5(url), type: 'jobs', host, url: decodeURI(url), done: 0 });
            }
          }
        }
        if (data.jobList && data.jobList.jobs) {
          for (const job of data.jobList.jobs) {
            const burl = `https://www.indeed.com/viewjob?jk=${job.jobKey}&from=vjs&vjs=1`;
            const durl = `https://www.indeed.com/rpc/jobdescs?jks=${job.jobKey}`;
            tasks.push({ _id: md5(burl), type: 'brief', host, url: burl, done: 0 });
            durls.push({ _id: md5(durl), type: 'detail', host, url: durl, done: 0 });
          }
        }
      } else {
        console.log(`${host}-jobs ${page.url} has no _initialData`);
      }
      let res = await global.com.task.insertMany(tasks, { ordered: false }).catch(() => {});
      res && console.log(`${host}-jobs insert ${res.insertedCount} from ${tasks.length} tasks`);
      res = await global.com.task.insertMany(durls, { ordered: false }).catch(() => {});
      res && console.log(`${host}-jobs insert ${res.insertedCount} from ${durls.length} tasks`);
      return 1;
    } catch (err) {
      console.error(`${host}-jobs parse ${page.url} ${err}`);
      return 0;
    }
  },
  // brief: parse the job brief JSON, derive an absolute publish date, and upsert the job
  brief: async (page) => {
    const host = URL.parse(page.url).hostname;
    try {
      const con = page.con.toString('utf-8');
      const data = json(con);
      data.done = 0;
      data.views = 0;
      data.host = host;
      // format publish date
      if (data.vfvm && data.vfvm.jobAgeRelative) {
        const str = data.vfvm.jobAgeRelative;
        const tmp = str.split(' ');
        const [first, second] = tmp;
        if (first == 'Just' || first == 'Today') {
          data.publishDate = Date.now();
        } else {
          const num = first.replace(/\+/, '');
          if (second == 'hours') {
            const date = new Date();
            const time = date.getTime();
            // eslint-disable-next-line no-mixed-operators
            date.setTime(time - num * 60 * 60 * 1000);
            data.publishDate = date.getTime();
          } else if (second == 'days') {
            const date = new Date();
            const time = date.getTime();
            // eslint-disable-next-line no-mixed-operators
            date.setTime(time - num * 24 * 60 * 60 * 1000);
            data.publishDate = date.getTime();
          } else {
            data.publishDate = Date.now();
          }
        }
      }
      await global.com.job.updateOne({ _id: data.jobKey }, { $set: data }, { upsert: true }).catch(() => { });
      const tasks = [];
      const url = `https://www.indeed.com/jobs?l=${data.jobLocationModel.jobLocation}&sort=date`;
      tasks.push({ _id: md5(url), type: 'search', host, url, done: 0 });
      const res = await global.com.task.insertMany(tasks, { ordered: false }).catch(() => {});
      res && console.log(`${host}-brief insert ${res.insertedCount} from ${tasks.length} tasks`);
      return 1;
    } catch (err) {
      console.error(`${host}-brief parse ${page.url} ${err}`);
      return 0;
    }
  },
  detail: async (page) => {
    const host = URL.parse(page.url).hostname;
    try {
      const con = page.con.toString('utf-8');
      const data = json(con);
      const [jobKey] = Object.keys(data);
      await global.com.job.updateOne({ _id: jobKey }, { $set: { content: data[jobKey], done: 1 } }).catch(() => { });
      return 1;
    } catch (err) {
      console.error(`${host}-detail parse ${page.url} ${err}`);
      return 0;
    }
  },
  run: (page) => {
    if (page.type == 'list') {
      page.type = 'search';
    }
    const fn = fns[page.type];
    if (fn) {
      return fn(page);
    }
    console.error(`${page.url} parser not found`);
    return 0;
  }
};

module.exports = fns;
```
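The post doesn't show the scheduler that drives these parsers, so here is a minimal sketch of how the pieces might fit together. It is my own illustration under assumptions about the original setup: the collection names under `global.com`, the database name, the seed URL, the `./parsers` filename and the `drive()` loop are all hypothetical; only `fetch` and `fns.run` come from the code above.

```js
// Hypothetical driver loop (not from the original project):
// connect to MongoDB, seed the Browse Jobs entry page, then repeatedly pick an
// unfinished task, download it with fetch(), and dispatch it to fns.run().
const { MongoClient } = require('mongodb');
const md5 = require('md5');
const fns = require('./parsers'); // the fns object above; filename is an assumption
// const fetch = require('./fetch'); // the request wrapper shown earlier

const drive = async () => {
  const client = await MongoClient.connect('mongodb://localhost:27017');
  const db = client.db('indeed'); // database name is an assumption
  global.com = {
    task: db.collection('task'),
    city: db.collection('city'),
    category: db.collection('category'),
    company: db.collection('company'),
    job: db.collection('job')
  };
  // Seed the entry page; the exact Browse Jobs URL is assumed here.
  const seed = 'https://www.indeed.com/browsejobs';
  await global.com.task.updateOne(
    { _id: md5(seed) },
    { $setOnInsert: { type: 'start', url: seed, done: 0 } },
    { upsert: true }
  );
  // Crawl loop: one undone task at a time.
  let page = await global.com.task.findOne({ done: 0 });
  while (page) {
    const body = await fetch(page.url);
    let ok = 0;
    if (Buffer.isBuffer(body)) {
      page.con = body;            // parsers read the raw body from page.con
      ok = await fns.run(page);   // dispatch by page.type
    }
    // mark success as 1 and failure as -1 so failed tasks are not retried forever
    await global.com.task.updateOne({ _id: page._id }, { $set: { done: ok ? 1 : -1 } });
    page = await global.com.task.findOne({ done: 0 });
  }
  await client.close();
};

drive().catch(console.error);
```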
Every parse method inserts some new links, and every new link record carries a type field. The type tells the crawler which parse method handles that link, so the whole site can be walked step by step. For example, the start method inserts records of type city and category; a page record of type city is handled by the city method, which in turn inserts links of type search; and so on, until the brief and detail methods finally fetch each job's summary and full description.

These HTML parsers really are the heart of the crawler — with them you can turn any page into whatever structured content you want.

## Data indexing

This part is simple. With the structured data collected above, create a schema in elasticsearch and write a program that periodically adds the job data to the ES index (a rough sketch of such an indexer closes out this post). Because a job's full description is quite large, I left the content field out of the index — it used too much memory and the server couldn't cope. >_<

## DEMO

Finally, here is the link again for you to try: job search engine.
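As promised in the indexing section, here is a minimal sketch of the periodic indexer. It is an assumption about the setup rather than the project's actual code: the index name, the use of the official @elastic/elasticsearch client, and the `indexJobs` helper are my own choices for illustration; only the idea (copy finished job records from MongoDB into ES, skipping the content field) comes from the text above.

```js
// Hypothetical indexer: pushes finished job records from MongoDB into Elasticsearch.
// Index name, client library, and schedule are assumptions for illustration.
const { Client } = require('@elastic/elasticsearch');

const es = new Client({ node: 'http://localhost:9200' });

const indexJobs = async () => {
  // global.com.job is the MongoDB collection written by the brief/detail parsers above
  const cursor = global.com.job.find({ done: 1 });
  for await (const job of cursor) {
    const { content, ...doc } = job; // drop the bulky content field to save memory
    await es.index({
      index: 'jobs',   // assumed index name
      id: job._id,     // the job key doubles as the document id
      body: doc
    });
  }
};

// Run on a timer, e.g. once an hour.
setInterval(() => indexJobs().catch(console.error), 60 * 60 * 1000);
```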