import axios from 'axios';
import * as cheerio from 'cheerio';
import * as fs from 'fs';
import * as path from 'path';
import https from 'https';
import { pipeline } from 'stream/promises';

// Raise the listener cap on the shared agent so many concurrent
// requests don't trigger MaxListenersExceededWarning.
https.globalAgent.setMaxListeners(50);

const TARGET_URL = 'https://norebbostock.com/collections/all';
const OUTPUT_DIR = './scraped_liveries';

// Stream an image straight to disk rather than buffering it in memory.
async function downloadImage(imageUrl: string, filename: string): Promise<void> {
  const filepath = path.join(OUTPUT_DIR, filename);
  const response = await axios.get(imageUrl, { responseType: 'stream' });
  const writer = fs.createWriteStream(filepath);
  try {
    await pipeline(response.data, writer);
  } catch (err) {
    // Remove the partial file so the skip-if-exists check won't keep it.
    fs.rmSync(filepath, { force: true });
    throw err;
  }
}

// A srcset lists candidates in ascending width, so the last entry is the
// highest resolution. Shopify CDN URLs are protocol-relative (`//cdn...`).
function getHighestResSrc(srcset: string): string | null {
  const entries = srcset.split(',').map(s => s.trim());
  const last = entries[entries.length - 1];
  if (!last) return null;
  const url = last.split(' ')[0];
  if (!url) return null;
  return url.startsWith('//') ? 'https:' + url : url;
}

// Turn a product heading into a safe, kebab-case filename.
function toFilename(heading: string): string {
  return heading
    .replace(/\bIllustration\b/gi, '')
    .trim()
    .replace(/[\/\\]/g, '-')
    .replace(/\s+/g, '-')
    .replace(/-+/g, '-')
    .replace(/^-|-$/g, '')
    .toLowerCase();
}

// Read the highest page number out of the pagination links; default to 1
// when the collection fits on a single page.
async function getPageCount(): Promise<number> {
  const { data } = await axios.get(TARGET_URL, {
    headers: { 'User-Agent': 'Mozilla/5.0' },
  });
  const $ = cheerio.load(data);
  const lastPage = $('.pagination__list li a')
    .map((_, el) => parseInt($(el).text().trim(), 10))
    .get()
    .filter(n => !isNaN(n))
    .pop();
  return lastPage ?? 1;
}

// Collect the best image URL and heading for every product card on a page.
async function scrapePage(url: string): Promise<{ url: string; heading: string }[]> {
  const { data } = await axios.get(url, {
    headers: { 'User-Agent': 'Mozilla/5.0' },
  });
  const $ = cheerio.load(data);
  const images: { url: string; heading: string }[] = [];
  $('.card-wrapper').each((_, el) => {
    const srcset = $(el).find('.card__media img').attr('srcset');
    const heading = $(el).find('.card__heading.h5').text().trim();
    if (srcset && heading) {
      const imgUrl = getHighestResSrc(srcset);
      if (imgUrl) images.push({ url: imgUrl, heading });
    }
  });
  return images;
}

async function scrapeImages(): Promise<void> {
  if (!fs.existsSync(OUTPUT_DIR)) fs.mkdirSync(OUTPUT_DIR);
  const pageCount = await getPageCount();
  console.log(`Found ${pageCount} pages`);

  for (let page = 1; page <= pageCount; page++) {
    const url = page === 1 ? TARGET_URL : `${TARGET_URL}?page=${page}`;
    console.log(`\nScraping page ${page}/${pageCount}...`);
    const images = await scrapePage(url);
    console.log(`Found ${images.length} images`);

    for (const { url: imgUrl, heading } of images) {
      const ext = path.extname(new URL(imgUrl).pathname) || '.jpg';
      const filename = toFilename(heading) + ext;

      // Skip if already downloaded
      if (fs.existsSync(path.join(OUTPUT_DIR, filename))) {
        console.log(`⟳ Skipped (exists): ${filename}`);
        continue;
      }
      try {
        await downloadImage(imgUrl, filename);
        console.log(`✓ ${filename}`);
      } catch (err) {
        console.error(`✗ Failed: ${imgUrl}`, (err as any).response?.status ?? err);
      }
    }
  }
  console.log('\nDone!');
}

scrapeImages().catch(err => {
  console.error('Fatal:', err);
  process.exit(1);
});