import type { AmazonPageWorker, AmazonPageWorkerEvents, LanchTaskBaseOptions } from './types'; import type { Tabs } from 'webextension-polyfill'; import { withErrorHandling } from './error-handler'; import { AmazonDetailPageInjector, AmazonReviewPageInjector, AmazonSearchPageInjector, } from './web-injectors/amazon'; import { isForbiddenUrl } from '~/env'; import { BaseWorker } from './base'; /** * AmazonPageWorkerImpl can run on background & sidepanel & popup, * **can't** run on content script! */ class AmazonPageWorkerImpl extends BaseWorker implements AmazonPageWorker { //#region Singleton private static _instance: AmazonPageWorker | null = null; public static getInstance() { if (this._instance === null) { this._instance = new AmazonPageWorkerImpl(); } return this._instance; } protected constructor() { super(); } //#endregion private async getCurrentTab(): Promise { const tab = await browser.tabs .query({ active: true, currentWindow: true }) .then((tabs) => tabs[0]); return tab; } private async createNewTab(url: string): Promise { const tab = await browser.tabs.create({ url, active: true }); return tab; } private async wanderSearchSinglePage(tab: Tabs.Tab) { const injector = new AmazonSearchPageInjector(tab); // Wait for the Next button to appear, indicating that the product items have finished loading await injector.waitForPageLoaded(); // Determine the type of product search page https://github.com/primedigitaltech/azon_seeker/issues/1 const pagePattern = await injector.getPagePattern(); // Retrieve key nodes and their information from the critical product search page const data = await injector.getPageData(pagePattern); // get current page const page = await injector.getCurrentPage(); // Determine if it is the last page, otherwise navigate to the next page const hasNextPage = await injector.determineHasNextPage(); await new Promise((resolve) => setTimeout(resolve, 1000)); if (data === null || typeof hasNextPage !== 'boolean') { this.emit('error', { message: '爬取单页信息失败', url: tab.url }); throw new Error('爬取单页信息失败'); } return { data, hasNextPage, page }; } @withErrorHandling public async doSearch(keywords: string): Promise { const url = new URL('https://www.amazon.com/s'); url.searchParams.append('k', keywords); let tab = await this.getCurrentTab(); if (!tab.url?.startsWith('http')) { tab = await this.createNewTab('https://www.amazon.com/'); tab.url = 'https://www.amazon.com/'; } const currentUrl = new URL(tab.url!); if (currentUrl.hostname !== url.hostname || currentUrl.searchParams.get('k') !== keywords) { tab = await browser.tabs.update(tab.id, { url: url.toString(), active: true }); await new Promise((resolve) => setTimeout(resolve, 1000)); } return url.toString(); } @withErrorHandling public async wanderSearchPage(): Promise { const tab = await this.getCurrentTab(); let offset = 0; while (true) { const { hasNextPage, data, page } = await this.wanderSearchSinglePage(tab); const keywords = new URL(tab.url!).searchParams.get('k')!; const objs = data.map((r, i) => ({ ...r, keywords, page, rank: offset + 1 + i, createTime: dayjs().format('YYYY/M/D HH:mm:ss'), asin: /(?<=\/dp\/)[A-Z0-9]{10}/.exec(r.link as string)![0], })); this.emit('item-links-collected', { objs }); offset += data.length; if (!hasNextPage) { break; } } return new Promise((resolve) => setTimeout(resolve, 1000)); } @withErrorHandling public async wanderDetailPage(entry: string, aplus: boolean = false) { //#region Initial Meta Info const params = { asin: '', url: '' }; if (entry.match(/^https?:\/\/www\.amazon\.com.*\/dp\/[A-Z0-9]{10}/)) { const [asin] = /\/\/dp\/[A-Z0-9]{10}/.exec(entry)!; params.asin = asin; params.url = entry; } else if (entry.match(/^[A-Z0-9]{10}$/)) { params.asin = entry; params.url = `https://www.amazon.com/dp/${entry}`; } let tab = await this.getCurrentTab(); if (!tab.url || isForbiddenUrl(tab.url)) { tab = await this.createNewTab(params.url); } else { tab = await browser.tabs.update(tab.id, { url: params.url, }); } const injector = new AmazonDetailPageInjector(tab); //#endregion //#region Await Production Introduction Element Loaded await injector.waitForPageLoaded(); await new Promise((resolve) => setTimeout(resolve, 3000)); // Wait 3 seconds. //#endregion //#region Fetch Base Info const baseInfo = await injector.getBaseInfo(); const ratingInfo = await injector.getRatingInfo(); await this.emit('item-base-info-collected', { asin: params.asin, ...baseInfo, ...ratingInfo, timestamp: dayjs().format('YYYY/M/D HH:mm:ss'), }); //#endregion //#region Fetch Category Rank Info let rawRankingText: string | null = await injector.getRankText(); if (rawRankingText) { const info: Pick = {}; let statement = /#[0-9,]+\sin\s\S[\s\w',\.&\(\)\-]+/.exec(rawRankingText)?.[0]; if (statement) { const name = /(?<=in\s).+/.exec(statement)?.[0].replace(/\s\(See\sTop.+\)/, ''); const rank = Number(/(?<=#)[0-9,]+/.exec(statement)?.[0].replaceAll(',', '')); if (name && !Number.isNaN(rank)) { info['category1'] = { name, rank }; } rawRankingText = rawRankingText.replace(statement, ''); } statement = /#[0-9,]+\sin\s\S[\s\w',\.&\(\)\-]+/.exec(rawRankingText)?.[0]; if (statement) { const name = /(?<=in\s).+/.exec(statement)?.[0].replace(/[\s]+$/, ''); const rank = Number(/(?<=#)[0-9,]+/.exec(statement)?.[0].replaceAll(',', '')); if (name && !Number.isNaN(rank)) { info['category2'] = { name, rank }; } } await this.emit('item-category-rank-collected', { asin: params.asin, ...info, }); } //#endregion //#region Fetch Goods' Images const imageUrls = await injector.getImageUrls(); imageUrls.length > 0 && (await this.emit('item-images-collected', { asin: params.asin, imageUrls: Array.from(new Set(imageUrls)), })); await new Promise((resolve) => setTimeout(resolve, 2000)); // Wait 2 seconds. //#endregion //#region Fetch Top Reviews // const reviews = await injector.getTopReviews(); // reviews.length > 0 && // await this.emit('item-top-reviews-collected', { // asin: params.asin, // topReviews: reviews, // }); //#endregion // #region Get APlus Sreen shot if (aplus && (await injector.scanAPlus())) { const { b64: base64data } = await injector.captureAPlus(); await this.emit('item-aplus-screenshot-collect', { asin: params.asin, base64data }); } // #endregion } @withErrorHandling public async wanderReviewPage(asin: string) { const url = new URL( `https://www.amazon.com/product-reviews/${asin}/ref=cm_cr_dp_d_show_all_btm?ie=UTF8&reviewerType=all_reviews`, ); const tab = await this.createNewTab(url.toString()); const injector = new AmazonReviewPageInjector(tab); await injector.waitForPageLoad(); for (let star = 1; star <= 5; star++) { await injector.showStarsDropDownMenu(); await injector.selectStar(star); while (true) { await injector.waitForPageLoad(); const reviews = await injector.getSinglePageReviews(); reviews.length > 0 && this.emit('item-review-collected', { asin, reviews }); const hasNextPage = await injector.jumpToNextPageIfExist(); if (!hasNextPage) { break; } } } setTimeout(() => browser.tabs.remove(tab.id!), 1000); } public async runSearchPageTask( keywordsList: string[], options: LanchTaskBaseOptions = {}, ): Promise { const { progress } = options; let remains = [...keywordsList]; let interrupt = false; const unsubscribe = this.on('interrupt', () => { interrupt = true; }); while (remains.length > 0 && !interrupt) { const kw = remains.shift()!; await this.doSearch(kw); await this.wanderSearchPage(); progress && progress(remains); } unsubscribe(); } public async runDetailPageTask( asins: string[], options: LanchTaskBaseOptions & { aplus?: boolean } = {}, ): Promise { const { progress, aplus = false } = options; const remains = [...asins]; let interrupt = false; const unsubscribe = this.on('interrupt', () => { interrupt = true; }); while (remains.length > 0 && !interrupt) { const asin = remains.shift()!; await this.wanderDetailPage(asin, aplus); progress && progress(remains); } unsubscribe(); } public async runReviewPageTask( asins: string[], options: LanchTaskBaseOptions = {}, ): Promise { const { progress } = options; const remains = [...asins]; let interrupt = false; const unsubscribe = this.on('interrupt', () => { interrupt = true; }); while (remains.length > 0 && !interrupt) { const asin = remains.shift()!; await this.wanderReviewPage(asin); progress && progress(remains); } unsubscribe(); } public stop(): Promise { return this.emit('interrupt'); } } export default { getAmazonPageWorker(): AmazonPageWorker { return AmazonPageWorkerImpl.getInstance(); }, };