Initial commit
import axios from 'axios';
import * as cheerio from 'cheerio';
import * as fs from 'fs';
import * as path from 'path';
import https from 'https';
import { pipeline } from 'stream/promises';

// Allow many concurrent downloads over the shared HTTPS agent without
// triggering Node's MaxListenersExceededWarning.
https.globalAgent.setMaxListeners(50);

const TARGET_URL = 'https://norebbostock.com/collections/all';
const OUTPUT_DIR = './scraped_liveries';

// Stream the image straight to disk instead of buffering it in memory;
// pipeline() resolves once the write finishes and rejects if either
// stream errors.
async function downloadImage(imageUrl: string, filename: string): Promise<void> {
  const filepath = path.join(OUTPUT_DIR, filename);
  const response = await axios.get(imageUrl, { responseType: 'stream' });
  const writer = fs.createWriteStream(filepath);
  await pipeline(response.data, writer);
}

// Pick the last srcset candidate, assuming candidates are listed in
// ascending width (as Shopify themes render them), so the last entry is
// the highest resolution. Protocol-relative URLs get an https: prefix.
function getHighestResSrc(srcset: string): string | null {
  const entries = srcset.split(',').map(s => s.trim());
  const last = entries[entries.length - 1];
  if (!last) return null;
  const url = last.split(' ')[0];
  if (!url) return null;
  return url.startsWith('//') ? 'https:' + url : url;
}
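
// A worked example, using a hypothetical CDN srcset for illustration:
//
//   getHighestResSrc('//cdn.example.com/tee.jpg 165w, //cdn.example.com/tee.jpg?width=1024 1024w')
//   // → 'https://cdn.example.com/tee.jpg?width=1024'
//
// The last comma-separated candidate wins, and the "//" prefix is
// upgraded to "https:".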

// Turn a product heading into a safe kebab-case filename: drop the word
// "Illustration", replace slashes and whitespace with hyphens, collapse
// runs of hyphens, and lowercase the result.
function toFilename(heading: string): string {
  return heading
    .replace(/\bIllustration\b/gi, '')
    .trim()
    .replace(/[\/\\]/g, '-')
    .replace(/\s+/g, '-')
    .replace(/-+/g, '-')
    .toLowerCase();
}
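
// A worked example with a hypothetical product heading:
//
//   toFilename('Boeing 737-800 Ryanair Illustration')
//   // → 'boeing-737-800-ryanair'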

// Read the numbered pagination links on the first page to find the total
// page count; fall back to 1 when no pagination is rendered.
async function getPageCount(): Promise<number> {
  const { data } = await axios.get(TARGET_URL, {
    headers: { 'User-Agent': 'Mozilla/5.0' }
  });
  const $ = cheerio.load(data);

  const lastPage = $('.pagination__list li a')
    .map((_, el) => parseInt($(el).text().trim(), 10))
    .get()
    .filter(n => !isNaN(n))
    .pop();

  return lastPage ?? 1;
}
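
// For illustration, assuming the pagination renders links like
// "1 2 3 … 9": parseInt maps them to [1, 2, 3, NaN, 9], the NaN filter
// drops the ellipsis, and pop() returns 9 as the page count.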

// Collect the highest-resolution image URL and product heading from every
// product card on one collection page.
async function scrapePage(url: string): Promise<{ url: string; heading: string }[]> {
  const { data } = await axios.get(url, {
    headers: { 'User-Agent': 'Mozilla/5.0' }
  });

  const $ = cheerio.load(data);
  const images: { url: string; heading: string }[] = [];

  $('.card-wrapper').each((_, el) => {
    const srcset = $(el).find('.card__media img').attr('srcset');
    const heading = $(el).find('.card__heading.h5').text().trim();

    if (srcset && heading) {
      const url = getHighestResSrc(srcset);
      if (url) images.push({ url, heading });
    }
  });

  return images;
}

// Walk every collection page, deriving a filename from each product heading
// and downloading any image that isn't already on disk.
async function scrapeImages(): Promise<void> {
  if (!fs.existsSync(OUTPUT_DIR)) fs.mkdirSync(OUTPUT_DIR, { recursive: true });

  const pageCount = await getPageCount();
  console.log(`Found ${pageCount} pages`);

  for (let page = 1; page <= pageCount; page++) {
    const url = page === 1 ? TARGET_URL : `${TARGET_URL}?page=${page}`;
    console.log(`\nScraping page ${page}/${pageCount}...`);

    const images = await scrapePage(url);
    console.log(`Found ${images.length} images`);

    for (const { url: imgUrl, heading } of images) {
      // Keep the extension from the URL path (query string excluded);
      // default to .jpg when there is none.
      const ext = path.extname(new URL(imgUrl).pathname) || '.jpg';
      const filename = toFilename(heading) + ext;

      // Skip if already downloaded
      if (fs.existsSync(path.join(OUTPUT_DIR, filename))) {
        console.log(`⟳ Skipped (exists): ${filename}`);
        continue;
      }

      try {
        await downloadImage(imgUrl, filename);
        console.log(`✓ ${filename}`);
      } catch (err) {
        console.error(`✗ Failed: ${imgUrl}`, (err as any).response?.status ?? err);
      }
    }
  }

  console.log('\nDone!');
}

// Report top-level failures instead of leaving an unhandled promise rejection.
scrapeImages().catch(console.error);
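
// To run, assuming ts-node is available, axios and cheerio are installed,
// and the file is saved as scrape.ts (hypothetical name):
//
//   npx ts-node scrape.ts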