From 92c80b8e4a296e305ddd99b691e481312e81c0a7 Mon Sep 17 00:00:00 2001
From: Xavier Saliniere
Date: Mon, 15 Sep 2025 15:46:16 -0400
Subject: [PATCH 1/2] feat: implement access now crawling config

---
 README.md | 4 +
 src/config/sources/an.ts | 59 ++++
 src/config/sources/duk.ts | 28 +-
 src/config/sources/eff.ts | 59 ++--
 src/config/sources/fpf.ts | 43 ++-
 src/config/sources/index.ts | 2 +
 src/config/sources/lpe.ts | 33 +-
 src/config/sources/p2p.ts | 33 +-
 src/config/sources/tf.ts | 28 +-
 src/core/types.ts | 29 +-
 src/crawlers/ArticleListingCrawler.ts | 17 -
 src/crawlers/CrawlErrorManager.ts | 43 ---
 src/crawlers/MetadataTracker.ts | 21 +-
 .../extractors/BrowserFieldExtractor.ts | 15 +-
 .../extractors/ContentPageExtractor.ts | 34 +-
 .../extractors/ListingPageExtractor.ts | 204 +++++++-----
 src/crawlers/handlers/PaginationHandler.ts | 25 +-
 .../__fixtures__/an/biden-digital-rights.ts | 53 ++++
 .../an/kenya-sim-card-biometrics.ts | 41 +++
 .../an/russias-record-war-on-connectivity.ts | 27 ++
 ...-challenged-release-transparency-report.ts | 29 ++
 src/tests/crawlers/CrawlErrorManager.test.ts | 94 ++----
 .../crawlers/MetadataTracker.basic.test.ts | 53 +---
 .../crawlers/MetadataTracker.items.test.ts | 23 +-
 .../MetadataTracker.return-value.test.ts | 20 +-
 .../crawlers/MetadataTracker.session.test.ts | 23 +-
 .../extractors/BrowserFieldExtractor.test.ts | 36 ---
 .../BrowserFieldExtractor.whitespace.test.ts | 42 ---
 .../ConcurrentContentExtractor.test.ts | 8 +-
 .../extractors/ContentDataMapper.test.ts | 14 +-
 .../extractors/ContentPageExtractor.test.ts | 8 +-
 .../extractors/ListingPageExtractor.test.ts | 293 +++++++-----------
 .../ListingPageExtractor.whitespace.test.ts | 61 ++--
 .../PaginationHandler.navigation.test.ts | 8 +-
 src/tests/integration/an-integration.test.ts | 92 ++++++
 src/tests/ui/formatter.test.ts | 1 -
 36 files changed, 810 insertions(+), 793 deletions(-)
 create mode 100644 src/config/sources/an.ts
 create mode 100644 src/tests/__fixtures__/an/biden-digital-rights.ts
 create mode 100644 src/tests/__fixtures__/an/kenya-sim-card-biometrics.ts
 create mode 100644 src/tests/__fixtures__/an/russias-record-war-on-connectivity.ts
 create mode 100644 src/tests/__fixtures__/an/vodafone-challenged-release-transparency-report.ts

diff --git a/README.md b/README.md
index 8c09e47..317ebe8 100644
--- a/README.md
+++ b/README.md
@@ -71,6 +71,10 @@ Options:
   -h, --help  display help for command
 ```
 
+##### Notes
+
+- Avoid running multiple crawl operations at the same time: hitting a database lock currently throws an exception that ends the crawl (this could be improved).
+
 #### Serve - Starting the API server
 
 ```
diff --git a/src/config/sources/an.ts b/src/config/sources/an.ts
new file mode 100644
index 0000000..f03a055
--- /dev/null
+++ b/src/config/sources/an.ts
@@ -0,0 +1,59 @@
+import type { SourceConfig } from "@/core/types";
+
+export const anSource: SourceConfig = {
+  id: "an",
+  name: "Access Now",
+  type: "listing",
+  listing: {
+    url: "https://www.accessnow.org/news-updates/?_language=english",
+    pagination: {
+      next_button_selector: ".post-grid-pagination .facetwp-page.next",
+      delaySec: 10, // Access Now blocks the IP address when it detects a crawling operation
+    },
+    container_selector: ".post-grid.facetwp-template .post-grid-item",
+    shouldExcludeItem: (containerHtml, values) => {
+      const excludedPaths = [
+        "accessnow.org/press-release",
+        "accessnow.org/guide",
+      ];
+      return (
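+        // Exclude externally linked items and URLs under excluded paths (press releases, guides)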
containerHtml.includes("post-grid-item--external-icon") || + !!excludedPaths.filter((path) => values?.url?.includes(path)).length + ); + }, + fields: { + title: { + selector: ".post-grid-item--title", + attribute: "text", + }, + url: { + selector: ".post-grid-item--link", + attribute: "href", + }, + date: { + selector: ".post-grid-item--date", + attribute: "text", + }, + }, + }, + content: { + container_selector: "#post-container", + fields: { + title: { + selector: "header h1", + attribute: "text", + optional: true, + }, + content: { + selector: ".entry-content", + attribute: "node", + }, + author: { + selector: "#authors", + exclude_selectors: [".profilePic", ".authorInfo > a"], + attribute: "text", + optional: true, + }, + }, + }, +}; diff --git a/src/config/sources/duk.ts b/src/config/sources/duk.ts index 5317d40..798b962 100644 --- a/src/config/sources/duk.ts +++ b/src/config/sources/duk.ts @@ -10,21 +10,19 @@ export const dukSource: SourceConfig = { pagination: { next_button_selector: ".wp-pagenavi .nextpostslink", }, - items: { - container_selector: ".blog-with-tags.ls-archive-blog .et_pb_post", - fields: { - title: { - selector: ".entry-title", - attribute: "text", - }, - url: { - selector: ".entry-title a", - attribute: "href", - }, - date: { - selector: ".post-meta .published", - attribute: "text", - }, + container_selector: ".blog-with-tags.ls-archive-blog .et_pb_post", + fields: { + title: { + selector: ".entry-title", + attribute: "text", + }, + url: { + selector: ".entry-title a", + attribute: "href", + }, + date: { + selector: ".post-meta .published", + attribute: "text", }, }, }, diff --git a/src/config/sources/eff.ts b/src/config/sources/eff.ts index ae1a75d..b554ebe 100644 --- a/src/config/sources/eff.ts +++ b/src/config/sources/eff.ts @@ -4,42 +4,39 @@ export const effSource: SourceConfig = { id: "eff", name: "Electronic Frontier Foundation", type: "listing", - content_url_excludes: [ - "eff.org/event/", - "eff.org/wp/", - "eff.org/cases/", - "eff.org/calendar/", - ], listing: { url: "https://eff.org/updates", pagination: { next_button_selector: ".pager__item.pager__item--next a", }, - items: { - container_selector: ".views-row article.node", - fields: { - title: { - selector: ".node__title", - attribute: "text", - }, - url: { - selector: ".node__title a", - attribute: "href", - }, - date: { - selector: ".node-date", - attribute: "text", - }, - excerpt: { - selector: ".node__content", - attribute: "text", - optional: true, - }, - author: { - selector: ".node-author", - attribute: "text", - optional: true, - }, + container_selector: ".views-row article.node", + shouldExcludeItem: (_, values) => { + const excludedPaths = [ + "eff.org/event/", + "eff.org/wp/", + "eff.org/cases/", + "eff.org/calendar/", + ]; + return !!excludedPaths.filter((path) => values?.url?.includes(path)) + .length; + }, + fields: { + title: { + selector: ".node__title", + attribute: "text", + }, + url: { + selector: ".node__title a", + attribute: "href", + }, + date: { + selector: ".node-date", + attribute: "text", + }, + author: { + selector: ".node-author", + attribute: "text", + optional: true, }, }, }, diff --git a/src/config/sources/fpf.ts b/src/config/sources/fpf.ts index 9a4aa6d..7759f56 100644 --- a/src/config/sources/fpf.ts +++ b/src/config/sources/fpf.ts @@ -9,31 +9,24 @@ export const fpfSource: SourceConfig = { pagination: { next_button_selector: ".pagination .pagination-link:nth-of-type(2)", }, - items: { - container_selector: ".article-list .card-listing", - fields: { - title: { - 
selector: ".heading .card-link", - attribute: "text", - }, - url: { - selector: ".heading .card-link", - attribute: "href", - }, - date: { - selector: ".meta-info time", - attribute: "datetime", - }, - excerpt: { - selector: ".inner-content p", - attribute: "text", - optional: true, - }, - author: { - selector: ".meta-info .card-meta-link:not(:nth-child(1))", - attribute: "text", - optional: true, - }, + container_selector: ".article-list .card-listing", + fields: { + title: { + selector: ".heading .card-link", + attribute: "text", + }, + url: { + selector: ".heading .card-link", + attribute: "href", + }, + date: { + selector: ".meta-info time", + attribute: "datetime", + }, + author: { + selector: ".meta-info .card-meta-link:not(:nth-child(1))", + attribute: "text", + optional: true, }, }, }, diff --git a/src/config/sources/index.ts b/src/config/sources/index.ts index 62e9109..1b0a81f 100644 --- a/src/config/sources/index.ts +++ b/src/config/sources/index.ts @@ -1,4 +1,5 @@ import type { SourceConfig } from "@/core/types"; +import { anSource } from "./an.js"; import { dukSource } from "./duk.js"; import { effSource } from "./eff.js"; import { fpfSource } from "./fpf.js"; @@ -13,4 +14,5 @@ export const sources: SourceConfig[] = [ p2pSource, dukSource, tfSource, + anSource, ]; diff --git a/src/config/sources/lpe.ts b/src/config/sources/lpe.ts index 7c90c12..c15025b 100644 --- a/src/config/sources/lpe.ts +++ b/src/config/sources/lpe.ts @@ -9,26 +9,19 @@ export const lpeSource: SourceConfig = { pagination: { next_button_selector: "", }, - items: { - container_selector: ".section .post-card", - fields: { - title: { - selector: ".post-card__title", - attribute: "text", - }, - url: { - selector: ".post-card__title", - attribute: "href", - }, - date: { - selector: ".post-card__label span:nth-of-type(2)", - attribute: "text", - }, - excerpt: { - selector: ".post-card__content > span", - attribute: "text", - optional: true, - }, + container_selector: ".section .post-card", + fields: { + title: { + selector: ".post-card__title", + attribute: "text", + }, + url: { + selector: ".post-card__title", + attribute: "href", + }, + date: { + selector: ".post-card__label span:nth-of-type(2)", + attribute: "text", }, }, }, diff --git a/src/config/sources/p2p.ts b/src/config/sources/p2p.ts index a4824a8..9c8c6a6 100644 --- a/src/config/sources/p2p.ts +++ b/src/config/sources/p2p.ts @@ -10,26 +10,19 @@ export const p2pSource: SourceConfig = { pagination: { next_button_selector: ".nav-previous a", }, - items: { - container_selector: ".blog-masonry article", - fields: { - title: { - selector: ".entry-title", - attribute: "text", - }, - url: { - selector: ".entry-title a", - attribute: "href", - }, - date: { - selector: ".entry-date", - attribute: "text", - }, - excerpt: { - selector: ".entry-content", - attribute: "text", - optional: true, - }, + container_selector: ".blog-masonry article", + fields: { + title: { + selector: ".entry-title", + attribute: "text", + }, + url: { + selector: ".entry-title a", + attribute: "href", + }, + date: { + selector: ".entry-date", + attribute: "text", }, }, }, diff --git a/src/config/sources/tf.ts b/src/config/sources/tf.ts index c75d8d7..f6c67d3 100644 --- a/src/config/sources/tf.ts +++ b/src/config/sources/tf.ts @@ -9,21 +9,19 @@ export const tfSource: SourceConfig = { pagination: { next_button_selector: ".page__navigation .navigation__link.next", }, - items: { - container_selector: ".page__content .preview-article", - fields: { - title: { - selector: 
".preview-article__title", - attribute: "text", - }, - url: { - selector: "& > a", - attribute: "href", - }, - date: { - selector: ".preview-article__published time", - attribute: "text", - }, + container_selector: ".page__content .preview-article", + fields: { + title: { + selector: ".preview-article__title", + attribute: "text", + }, + url: { + selector: "& > a", + attribute: "href", + }, + date: { + selector: ".preview-article__published time", + attribute: "text", }, }, }, diff --git a/src/core/types.ts b/src/core/types.ts index 1276ed1..ccf6bca 100644 --- a/src/core/types.ts +++ b/src/core/types.ts @@ -1,5 +1,11 @@ -// Core crawler types and interfaces - +import type { + ContentFieldName, + ExtractedContentValues, +} from "@/crawlers/extractors/ContentPageExtractor"; +import type { + ExtractedListingValues, + ListingFieldName, +} from "@/crawlers/extractors/ListingPageExtractor"; import type { StoppedReason } from "@/crawlers/MetadataTracker"; export const CRAWLER_TYPES = { @@ -30,30 +36,29 @@ export interface FieldConfig { export interface PaginationConfig { next_button_selector?: string; - maxPages?: number; -} - -export interface ItemsConfig { - container_selector: string; - fields: Record; + delaySec?: number; } export interface ListingConfig { url: string; pagination?: PaginationConfig; - items: ItemsConfig; + container_selector: string; + shouldExcludeItem?: ( + containerHtml: string, + values?: ExtractedListingValues, + ) => boolean; + fields: Partial>; } export interface ContentConfig { container_selector: string; - fields: Record; + fields: Partial>; } export interface SourceConfig { id: string; name: string; type: CrawlerType; - content_url_excludes?: string[]; // URL patterns to exclude from content extraction disableJavascript?: boolean; listing: ListingConfig; content: ContentConfig; @@ -122,7 +127,7 @@ export interface CrawlOptions { // Statistics and metadata export interface FieldExtractionStats { - fieldName: string; + fieldName: ListingFieldName | ContentFieldName; successCount: number; totalAttempts: number; isOptional: boolean; diff --git a/src/crawlers/ArticleListingCrawler.ts b/src/crawlers/ArticleListingCrawler.ts index e4aefc6..9339191 100644 --- a/src/crawlers/ArticleListingCrawler.ts +++ b/src/crawlers/ArticleListingCrawler.ts @@ -64,15 +64,6 @@ async function processPageItems( pageResult.filteredReasons, ); - // Track field extraction warnings - const fieldWarnings = pageResult.filteredReasons.filter( - (reason) => - reason.includes("Optional field") || reason.includes("Required field"), - ); - if (fieldWarnings.length > 0) { - metadataTracker.addFieldExtractionWarnings(fieldWarnings); - } - // Filter out duplicates const newItems = filterDuplicates(pageResult.items, seenUrls); const sessionDuplicatesSkipped = pageResult.items.length - newItems.length; @@ -144,14 +135,6 @@ async function processContentExtraction( if (metadata.contentErrors.length > 0) { metadataTracker.addContentErrors(metadata.contentErrors); - metadata.contentErrors.length = 0; - } - - const contentWarnings = metadata.contentErrors.filter( - (error) => error.includes("Optional field") || error.includes("not found"), - ); - if (contentWarnings.length > 0) { - metadataTracker.addFieldExtractionWarnings(contentWarnings); } if (options?.onPageComplete) { diff --git a/src/crawlers/CrawlErrorManager.ts b/src/crawlers/CrawlErrorManager.ts index 71f446e..131c1c4 100644 --- a/src/crawlers/CrawlErrorManager.ts +++ b/src/crawlers/CrawlErrorManager.ts @@ -4,36 +4,11 @@ export type CrawlErrorType = 
"listing" | "content"; export interface CrawlErrorManager { addErrors(type: CrawlErrorType, errors: string[]): void; - addErrorsWithCategorization(errors: string[]): void; addListingErrors(errors: string[]): void; addContentErrors(errors: string[]): void; - addFieldExtractionWarnings(warnings: string[]): void; getSessionErrors(): { listingErrors: string[]; contentErrors: string[] }; } -export function categorizeErrors(errors: string[]): { - listingErrors: string[]; - contentErrors: string[]; -} { - const listingErrors: string[] = []; - const contentErrors: string[] = []; - - for (const error of errors) { - if ( - error.includes("Optional field") || - error.includes("Required field") || - error.includes("missing required fields") || - error.includes("no extractable data") - ) { - listingErrors.push(error); - } else { - contentErrors.push(error); - } - } - - return { listingErrors, contentErrors }; -} - export function createCrawlErrorManager( metadataStore: MetadataStore, sessionId: string, @@ -44,20 +19,6 @@ export function createCrawlErrorManager( metadataStore.addSessionErrors(sessionId, type, errors); }, - addErrorsWithCategorization(errors: string[]): void { - if (errors.length === 0) return; - - const { listingErrors, contentErrors } = categorizeErrors(errors); - - if (listingErrors.length > 0) { - this.addErrors("listing", listingErrors); - } - - if (contentErrors.length > 0) { - this.addErrors("content", contentErrors); - } - }, - addListingErrors(errors: string[]): void { this.addErrors("listing", errors); }, @@ -66,10 +27,6 @@ export function createCrawlErrorManager( this.addErrors("content", errors); }, - addFieldExtractionWarnings(warnings: string[]): void { - this.addErrorsWithCategorization(warnings); - }, - getSessionErrors(): { listingErrors: string[]; contentErrors: string[] } { const session = metadataStore.getSession(sessionId); if (!session) { diff --git a/src/crawlers/MetadataTracker.ts b/src/crawlers/MetadataTracker.ts index b6b931f..ea9f98d 100644 --- a/src/crawlers/MetadataTracker.ts +++ b/src/crawlers/MetadataTracker.ts @@ -4,6 +4,7 @@ import type { CrawlMetadata, CrawlResult, FieldConfig, + FieldExtractionStats, SourceConfig, } from "@/core/types"; import { createCrawlErrorManager } from "@/crawlers/CrawlErrorManager"; @@ -13,6 +14,8 @@ import { } from "@/storage/MetadataStore"; import { getStoragePath } from "@/utils/storagePath.js"; import { buildCrawlSummary } from "@/utils/summaryBuilder"; +import type { ContentFieldName } from "./extractors/ContentPageExtractor"; +import type { ListingFieldName } from "./extractors/ListingPageExtractor"; export enum MetadataActionType { ADD_ITEMS = "ADD_ITEMS", @@ -164,7 +167,10 @@ function createInitialMetadata( config: SourceConfig, sessionId: string, ): MetadataState { - const initFieldStats = (fieldName: string, fieldConfig: FieldConfig) => ({ + const initFieldStats = ( + fieldName: ListingFieldName | ContentFieldName, + fieldConfig: FieldConfig, + ): FieldExtractionStats => ({ fieldName, successCount: 0, totalAttempts: 0, @@ -182,11 +188,13 @@ function createInitialMetadata( itemsProcessed: 0, pagesProcessed: 0, contentsCrawled: 0, - fieldStats: Object.entries(config.listing.items.fields).map( - ([fieldName, fieldConfig]) => initFieldStats(fieldName, fieldConfig), + fieldStats: Object.entries(config.listing.fields).map( + ([fieldName, fieldConfig]) => + initFieldStats(fieldName as ListingFieldName, fieldConfig), ), contentFieldStats: Object.entries(config.content.fields).map( - ([fieldName, fieldConfig]) => 
initFieldStats(fieldName, fieldConfig), + ([fieldName, fieldConfig]) => + initFieldStats(fieldName as ContentFieldName, fieldConfig), ), listingErrors: [], contentErrors: [], @@ -214,7 +222,6 @@ export interface MetadataTracker extends ContentSessionLinker { addUrlsExcluded(count: number): MetadataState; addFilteredItems(count: number, reasons: string[]): MetadataState; addContentErrors(errors: string[]): void; - addFieldExtractionWarnings(warnings: string[]): void; addContentsCrawled(count: number): MetadataState; setStoppedReason(reason: StoppedReason): MetadataState; buildCrawlResult(): CrawlResult; @@ -378,10 +385,6 @@ export function createMetadataTracker( errorManager.addContentErrors(errors); }, - addFieldExtractionWarnings(warnings: string[]): void { - errorManager.addFieldExtractionWarnings(warnings); - }, - addContentsCrawled(count: number): MetadataState { const state = updateState({ type: MetadataActionType.ADD_CONTENTS_CRAWLED, diff --git a/src/crawlers/extractors/BrowserFieldExtractor.ts b/src/crawlers/extractors/BrowserFieldExtractor.ts index f318c60..07f24b8 100644 --- a/src/crawlers/extractors/BrowserFieldExtractor.ts +++ b/src/crawlers/extractors/BrowserFieldExtractor.ts @@ -128,17 +128,10 @@ export function createBrowserExtractionFunction() { const value = extractFieldValue(element, typedFieldConfig); results[fieldName] = value && value !== "" ? value : null; - // Log extraction issues for both required and optional fields - if (!value || value === "") { - if (typedFieldConfig.optional) { - extractionErrors.push( - `Optional field '${fieldName}' not found: selector '${typedFieldConfig.selector}' returned no results`, - ); - } else { - extractionErrors.push( - `Required field '${fieldName}' not found: selector '${typedFieldConfig.selector}' returned no results`, - ); - } + if (!typedFieldConfig.optional && (!value || value === "")) { + extractionErrors.push( + `Required field '${fieldName}' not found: selector '${typedFieldConfig.selector}' returned no results`, + ); } } catch (error) { extractionErrors.push(`Failed to extract ${fieldName}: ${error}`); diff --git a/src/crawlers/extractors/ContentPageExtractor.ts b/src/crawlers/extractors/ContentPageExtractor.ts index 361b07d..0094151 100644 --- a/src/crawlers/extractors/ContentPageExtractor.ts +++ b/src/crawlers/extractors/ContentPageExtractor.ts @@ -18,6 +18,13 @@ import type { MetadataStore } from "@/storage/MetadataStore.js"; import { resolveAbsoluteUrl } from "@/utils/url.js"; import type { BrowserHandler } from "../handlers/BrowserHandler"; +export interface ExtractedContentValues { + title: string | null; + author: string | null; + content: string | null; +} +export type ContentFieldName = keyof ExtractedContentValues; + export interface ContentPageExtractor { extractContentPagesConcurrently: ( items: CrawledData[], @@ -148,8 +155,6 @@ async function extractContentForSingleItem( ): Promise { if (!item.url) return; - const hasExcerpt = item.content && item.content.trim().length > 0; - try { const { contentData, errors } = await extractFromContentPage( browser, @@ -169,30 +174,15 @@ async function extractContentForSingleItem( updateItemMetadata(item, contentFields, failedContentFields, errors); if (errors.length > 0) { - if (hasExcerpt) { - contentErrors.push( - ...errors.map( - (err) => `Content extraction warning for ${item.url} : ${err}`, - ), - ); - } else { - const errorMessage = `Content extraction failed for ${item.url} (no excerpt available): ${errors.join(", ")}`; - contentErrors.push(errorMessage); - } + 
const errorMessage = `Content extraction failed for ${item.url} : ${errors.join(", ")}`; + contentErrors.push(errorMessage); } } catch (error) { const errorMessage = `Failed to extract content data for ${item.url} : ${error}`; - if (hasExcerpt) { - console.error(`Content extraction warning for ${item.url}`, error); - contentErrors.push( - `Content extraction warning for ${item.url} : ${errorMessage}`, - ); - } else { - contentErrors.push( - `Content extraction failed for ${item.url} : ${errorMessage}`, - ); - } + contentErrors.push( + `Content extraction failed for ${item.url} : ${errorMessage}`, + ); updateItemMetadata(item, [], [], [errorMessage]); } diff --git a/src/crawlers/extractors/ListingPageExtractor.ts b/src/crawlers/extractors/ListingPageExtractor.ts index 18db0fb..40d8d5f 100644 --- a/src/crawlers/extractors/ListingPageExtractor.ts +++ b/src/crawlers/extractors/ListingPageExtractor.ts @@ -1,12 +1,15 @@ import type { Page } from "puppeteer"; import type { CrawledData, + FieldConfig, FieldExtractionStats, + ListingConfig, SourceConfig, } from "@/core/types.js"; import { CRAWLER_TYPES } from "@/core/types.js"; import { DYNAMIC_CONTENT_TIMEOUT } from "@/crawlers/extractors/constants"; import { parsePublishedDate } from "@/utils/date.js"; +import type { ContentFieldName } from "./ContentPageExtractor"; export interface ListingExtractionResult { items: CrawledData[]; @@ -15,13 +18,21 @@ export interface ListingExtractionResult { filteredReasons: string[]; } +export interface ExtractedListingValues { + title: string | null; + url: string | null; + date: string | null; + author: string | null; +} +export type ListingFieldName = keyof ExtractedListingValues; + interface ExtractionResult { - item: Record; + values: ExtractedListingValues; fieldResults: Record< - string, + ListingFieldName | ContentFieldName, { success: boolean; value: string | null; error?: string } >; - hasExcludedUrl: boolean; + isExcluded: boolean; hasRequiredFields: boolean; missingRequiredFields: string[]; extractionErrors: string[]; @@ -43,18 +54,23 @@ async function extractItemsFromPage( currentItemOffset: number, ): Promise { try { - await page.waitForSelector(config.listing.items.container_selector, { + await page.waitForSelector(config.listing.container_selector, { timeout: DYNAMIC_CONTENT_TIMEOUT, }); } catch (error) { console.warn( - `Warning: Container selector "${config.listing.items.container_selector}" not found within ${DYNAMIC_CONTENT_TIMEOUT / 1000} seconds`, + `Warning: Container selector "${config.listing.container_selector}" not found within ${DYNAMIC_CONTENT_TIMEOUT / 1000} seconds`, error, ); } + const excludeFunction = config.listing.shouldExcludeItem + ? 
config.listing.shouldExcludeItem + : () => false; + await page.exposeFunction("isExcluded", excludeFunction); + const extractionResult = await page.evaluate( - (itemsConfig, excludedUrlPaths) => { + async (listingConfig: ListingConfig) => { function extractFieldValue( element: Element | null, fieldConfig: { @@ -116,99 +132,115 @@ async function extractItemsFromPage( } const containers = document.querySelectorAll( - itemsConfig.container_selector, + listingConfig.container_selector, ); const results: ExtractionResult[] = []; - containers.forEach((container) => { - const item: Record = {}; - const fieldResults: Record< - string, - { success: boolean; value: string | null; error?: string } - > = {}; - let hasRequiredFields = true; - const missingRequiredFields: string[] = []; - const extractionErrors: string[] = []; - - for (const [fieldName, fieldConfig] of Object.entries( - itemsConfig.fields, - )) { - let success = false; - let value: string | null = null; - let error: string | undefined; - - const typedFieldConfig = fieldConfig as { - selector: string; - attribute: string; - exclude_selectors?: string[]; - optional?: boolean; + await Promise.all( + [...containers].map(async (container) => { + const extractedValues: ExtractedListingValues = { + title: null, + url: null, + date: null, + author: null, }; + const fieldResults: Record< + string, + { success: boolean; value: string | null; error?: string } + > = {}; + let hasRequiredFields = true; + const missingRequiredFields: string[] = []; + const extractionErrors: string[] = []; + + for (const [fieldName, fieldConfig] of Object.entries( + listingConfig.fields, + ) as [ListingFieldName, FieldConfig][]) { + let success = false; + let value: string | null = null; + let error: string | undefined; + + const typedFieldConfig = fieldConfig as { + selector: string; + attribute: string; + exclude_selectors?: string[]; + optional?: boolean; + }; - try { - const element = container.querySelector(typedFieldConfig.selector); - if (element) { - value = extractFieldValue(element, typedFieldConfig); + try { + const element = container.querySelector( + typedFieldConfig.selector, + ); + if (element) { + value = extractFieldValue(element, typedFieldConfig); + } + success = value !== null && value !== ""; + } catch (err) { + error = + err instanceof Error + ? err.message + : `Unknown error extracting field ${fieldName}`; + extractionErrors.push( + `Field '${fieldName}' extraction failed: ${error}`, + ); } - success = value !== null && value !== ""; - } catch (err) { - error = - err instanceof Error - ? 
err.message - : `Unknown error extracting field ${fieldName}`; - extractionErrors.push( - `Field '${fieldName}' extraction failed: ${error}`, - ); - } - fieldResults[fieldName] = { success, value, error }; + fieldResults[fieldName] = { success, value, error }; - if (success) { - item[fieldName] = value; - } else if (!typedFieldConfig.optional) { - hasRequiredFields = false; - missingRequiredFields.push(fieldName); + if (success) { + extractedValues[fieldName] = value; + } else if (!typedFieldConfig.optional) { + hasRequiredFields = false; + missingRequiredFields.push(fieldName); + } } - } - const hasExcludedUrl = !!excludedUrlPaths.filter((path) => - item.url?.includes(path), - ).length; - - results.push({ - item, - fieldResults, - hasExcludedUrl, - hasRequiredFields, - missingRequiredFields, - extractionErrors, - }); - }); + const isExcluded = await ( + window as unknown as { + isExcluded: ( + a: string, + b: ExtractedListingValues, + ) => Promise; + } + ).isExcluded(container.outerHTML, extractedValues); + console.log(isExcluded); + + results.push({ + values: extractedValues, + fieldResults, + isExcluded, + hasRequiredFields, + missingRequiredFields, + extractionErrors, + }); + }), + ); return results; }, - config.listing.items, - config.content_url_excludes ?? [], + config.listing, ); + page.removeExposedFunction("isExcluded"); const validItems = extractionResult.filter( (result: ExtractionResult) => - !result.hasExcludedUrl && + !result.isExcluded && result.hasRequiredFields && - Object.keys(result.item).length > 0, + Object.keys(result.values).length > 0, ); const filteredItems = extractionResult.filter( (result: ExtractionResult) => - result.hasExcludedUrl || + result.isExcluded || !result.hasRequiredFields || - Object.keys(result.item).length === 0, + Object.keys(result.values).length === 0, ); + extractItemsFromPage; const excludedUrls: string[] = []; const filteredReasons: string[] = []; filteredItems.forEach((result: ExtractionResult) => { - if (result.hasExcludedUrl && result.item.url) { - excludedUrls.push(result.item.url); + if (result.isExcluded && result.values.url) { + excludedUrls.push(result.values.url); return; } // Add extraction errors first @@ -218,12 +250,12 @@ async function extractItemsFromPage( }); } - if (Object.keys(result.item).length === 0) { + if (Object.keys(result.values).length === 0) { filteredReasons.push("Item contained no extractable data"); } else if (!result.hasRequiredFields) { const missingFields = result.missingRequiredFields.join(", "); const itemIdentifier = - result.item.url || result.item.title || "Unknown item"; + result.values.url || result.values.title || "Unknown item"; filteredReasons.push( `Item "${itemIdentifier}" missing required fields: ${missingFields}. 
Seen at ${page.url()}`, ); @@ -233,15 +265,17 @@ async function extractItemsFromPage( }); extractionResult - .filter((result) => !result.hasExcludedUrl) + .filter((result) => !result.isExcluded) .forEach((result: ExtractionResult, itemIndex: number) => { const itemIdentifier = - result.item.url || result.item.title || `Item ${itemIndex + 1}`; + result.values.url || result.values.title || `Item ${itemIndex + 1}`; Object.entries(result.fieldResults).forEach( ([fieldName, fieldResult]) => { if (!fieldResult.success) { - const fieldConfig = config.listing.items.fields[fieldName] as { + const fieldConfig = config.listing.fields[ + fieldName as ListingFieldName + ] as { selector: string; attribute: string; optional?: boolean; @@ -263,7 +297,7 @@ async function extractItemsFromPage( }); extractionResult.forEach((result: ExtractionResult, itemIndex: number) => { - if (result.hasExcludedUrl) return; + if (result.isExcluded) return; fieldStats.forEach((stat) => { stat.totalAttempts++; const fieldResult = result.fieldResults[stat.fieldName]; @@ -279,27 +313,27 @@ async function extractItemsFromPage( (result: ExtractionResult) => { let publishedDate: string | undefined; try { - publishedDate = result.item.date - ? parsePublishedDate(result.item.date) + publishedDate = result.values.date + ? parsePublishedDate(result.values.date) : undefined; } catch (error) { throw new Error( - `Date parsing failed for item "${result.item.title || result.item.url}": ${error instanceof Error ? error.message : "Unknown error"}`, + `Date parsing failed for item "${result.values.title || result.values.url}": ${error instanceof Error ? error.message : "Unknown error"}`, ); } return { - url: result.item.url || "", + url: result.values.url || "", crawledAt: new Date(), source: config.id, - title: result.item.title || "", - content: result.item.excerpt || "", - author: result.item.author || undefined, + title: result.values.title || "", + content: "", + author: result.values.author || undefined, publishedDate, metadata: { crawlerType: CRAWLER_TYPES.LISTING, configId: config.id, - extractedFields: Object.keys(result.item), + extractedFields: Object.keys(result.values), }, }; }, diff --git a/src/crawlers/handlers/PaginationHandler.ts b/src/crawlers/handlers/PaginationHandler.ts index 647652c..7b24d6a 100644 --- a/src/crawlers/handlers/PaginationHandler.ts +++ b/src/crawlers/handlers/PaginationHandler.ts @@ -2,10 +2,10 @@ import type { Page, TimeoutError } from "puppeteer"; import type { SourceConfig } from "@/core/types.js"; const PAGINATION_TIMEOUTS = { - NAVIGATION_MS: 5000, - CONTAINER_WAIT_MS: 20000, - CONTENT_LOAD_DELAY_MS: 1000, - RETRY_DELAY_MS: 15000, + NAVIGATION_SEC: 5, + CONTAINER_WAIT_SEC: 20, + CONTENT_LOAD_DELAY_SEC: 1, + RETRY_DELAY_SEC: 15, } as const; const PAGINATION_RETRY = { @@ -51,17 +51,20 @@ async function attemptPagination( try { await page.waitForNavigation({ waitUntil: "domcontentloaded", - timeout: PAGINATION_TIMEOUTS.NAVIGATION_MS, + timeout: PAGINATION_TIMEOUTS.NAVIGATION_SEC * 1000, }); } catch {} - await new Promise((resolve) => - setTimeout(resolve, PAGINATION_TIMEOUTS.CONTENT_LOAD_DELAY_MS), - ); + const waitTime = + config.listing.pagination?.delaySec ?? 
+ PAGINATION_TIMEOUTS.CONTENT_LOAD_DELAY_SEC; + + console.log(`Waiting ${waitTime}sec`); + await new Promise((resolve) => setTimeout(resolve, waitTime * 1000)); try { - await page.waitForSelector(config.listing.items.container_selector, { - timeout: PAGINATION_TIMEOUTS.CONTAINER_WAIT_MS, + await page.waitForSelector(config.listing.container_selector, { + timeout: PAGINATION_TIMEOUTS.CONTAINER_WAIT_SEC * 1000, }); } catch (error) { console.warn( @@ -96,7 +99,7 @@ async function retryPagination( await page.reload(); await new Promise((resolve) => - setTimeout(resolve, PAGINATION_TIMEOUTS.RETRY_DELAY_MS), + setTimeout(resolve, PAGINATION_TIMEOUTS.RETRY_DELAY_SEC * 1000), ); } diff --git a/src/tests/__fixtures__/an/biden-digital-rights.ts b/src/tests/__fixtures__/an/biden-digital-rights.ts new file mode 100644 index 0000000..3dfbe8e --- /dev/null +++ b/src/tests/__fixtures__/an/biden-digital-rights.ts @@ -0,0 +1,53 @@ +export default `Six months ago, as the pandemic raged, Joe Biden and Kamala Harris became President and Vice President of the United States. On inauguration day, we launched the [U.S. Digital Rights in the Biden Era](https://www.accessnow.org/biden-era-tech-policy-tracker/) tracker to hold the administration and Congress accountable for defending human rights on tech policy topics ranging from data protection to disinformation to foreign policy. + +Here are some important updates on how the Administration has done its first six months in office across seven core issues: + +* * * + +#### **[➔ Pass a federal data protection law](https://www.accessnow.org/biden-era-tech-policy-tracker/#data-protection)** + +While no federal data protection law has passed, some progress has been made. Most notably, President Biden signed an [executive order](https://www.whitehouse.gov/briefing-room/presidential-actions/2021/07/09/executive-order-on-promoting-competition-in-the-american-economy/) that includes [calling on the Federal Trade Commission (FTC) to exercise their rulemaking authority](https://news.bloomberglaw.com/pharma-and-life-sciences/bidens-executive-order-links-data-collection-to-competition) on “unfair data collection and surveillance practices that may damage competition, consumer autonomy, and consumer privacy.” Unfortunately, the executive order makes no mention of discriminatory data practices or violations of civil rights, so we will continue to push for the inclusion of these issues in any FTC rulemaking. + +* * * + +#### [➔ Reinstate net neutrality](https://www.accessnow.org/biden-era-tech-policy-tracker/#net-neutrality) + +There has been no movement on net neutrality, particularly because the Federal Communications Commission (FCC) remains deadlocked at 2-2. It is still, however, on the administration’s mind given its recent [executive order](https://www.whitehouse.gov/briefing-room/presidential-actions/2021/07/09/executive-order-on-promoting-competition-in-the-american-economy/) that calls on the FCC to [restore net neutrality](https://www.theverge.com/2021/7/9/22569869/biden-executive-order-right-to-repair-isps-net-neutrality). + +* * * + +#### [➔ Close the digital divide and ensure connectivity](https://www.accessnow.org/biden-era-tech-policy-tracker/#connectivity) + +The FCC implemented the [Emergency Broadband Benefit](https://www.fcc.gov/broadbandbenefit) (to help people with low-incomes afford broadband) and the [Emergency Connectivity Fund](https://www.fcc.gov/emergency-connectivity-fund) (to help schools and libraries build connectivity). 
Relatedly, we helped [launch](https://www.accessnow.org/broadband-access-united-states/) the [Let’s Broadband Together](https://www.consumerreports.org/upload/broadband) campaign with Consumer Reports, which will look into how much broadband costs, a metric we have long asked the FCC to collect, to no avail. + +* * * + +#### [➔ Update Section 230](https://www.accessnow.org/biden-era-tech-policy-tracker/#230) + +No updates to Section 230 have passed Congress, but in a critical victory for the defense of free expression, Biden [repealed](https://www.theverge.com/2021/5/15/22437627/biden-revokes-trump-executive-order-section-230-twitter-facebook-google) Trump’s [executive order](https://www.accessnow.org/twitter-corrects-trumps-inaccurate-ballot-tweet-so-he-retaliates-with-legally-nonsensical-executive-order/) that asked federal agencies to retaliate against social media companies by changing Section 230. This decision helps ensure the Section 230 debate is left to Congress, where it belongs. + +* * * + +#### [➔ Address and combat disinformation](https://www.accessnow.org/biden-era-tech-policy-tracker/#disinformation) + +The U.S. Surgeon General published a [Confronting Health Misinformation](https://www.hhs.gov/sites/default/files/surgeon-general-misinformation-advisory.pdf) advisory that declares health misinformation a “serious threat” and calls on every sector of society to help limit its spread. To improve interagency coordination on this threat and other disinformation harms, we joined PEN America in calling on the White House to [create a designated task force](https://pen.org/press-release/coalition-urges-white-house-to-establish-disinformation-defense-and-free-expression-task-force/) to combat disinformation, but have yet to see the administration act. + +* * * + +#### [➔ Regulate facial recognition technology](https://www.accessnow.org/biden-era-tech-policy-tracker/#facial-recognition) + +Fortunately, we’ve seen some movement from the administration and Congress on facial recognition and biometric surveillance more broadly. The administration [rescinded](https://www.eff.org/deeplinks/2021/06/victory-biden-administration-rescinds-dangerous-proposed-rule-expand-biometrics) a Department of Homeland Security proposed rule to expand biometric collection of immigrants applying for benefits. And U.S. Senator Ed Markey introduced the [Facial Recognition and Biometric Technology Moratorium Act](https://www.markey.senate.gov/news/press-releases/senators-markey-and-merkley-and-reps-jayapal-pressley-to-introduce-legislation-to-ban-government-use-of-facial-recognition-other-biometric-technology), a bill that would end federal use of facial recognition tools and terminate federal funding for state and local law enforcement use of biometric tech. + +* * * + +#### [➔ Place human rights at the center of foreign policy](https://www.accessnow.org/biden-era-tech-policy-tracker/#foreign-policy) + +The new administration’s foreign policy on digital rights is moving in the right direction. We were invited to brief U.S. State Department officials on the [#KeepItOn campaign](https://www.accessnow.org/keepiton/) to combat internet shutdowns globally, and U.S. Secretary of State Blinken forcefully [condemned](https://cl.usembassy.gov/statement-by-secretary-of-state-antony-j-blinken-on-world-press-freedom-day/) internet shutdowns. 
And, we were excited to see that the State Department [plans to update](https://www.state.gov/10th-anniversary-of-the-un-guiding-principles-on-business-and-human-rights/) the [United States’ National Action Plan on Responsible Business Conduct](https://www.state.gov/u-s-national-contact-point-for-the-oecd-guidelines-for-multinational-enterprises/u-s-national-action-plan-on-responsible-business-conduct/) to better hold corporations accountable to the U.N. Guiding Principles on Business and Human Rights. + +_**Remember: Personnel is policy**_ + +The biggest barrier to the administration making more progress on tech policy is lack of political appointees. In addition to the FCC vacancy mentioned above, the National Telecommunications and Information Administration (the president’s technology policy agency) lacks a full-time director, and the lead antitrust position at the Department of Justice is still open. The FTC has been at full capacity for only one month of Biden’s tenure, and, despite Lina Khan’s recent confirmation as Chair, the FTC will again be missing a commissioner when Rohit Chopra moves to the Consumer Financial Protection Bureau. As long as these positions remain unfilled, President Biden will continue to lose precious time to move the needle on digital rights. + +All to say, Biden’s first six months have been a mixed bag on tech policy and human rights. He clearly has a desire to push the envelope, given his recent [executive order](https://www.whitehouse.gov/briefing-room/presidential-actions/2021/07/09/executive-order-on-promoting-competition-in-the-american-economy/), but he’s hobbled his own administration by dragging his feet on nominations. It’s time to get moving on all these issues. + +We’ll continue to track progress [here](https://www.accessnow.org/biden-era-tech-policy-tracker/) as we keep the pressure on. Be on the lookout for [updates from us](https://twitter.com/accessnow).`; diff --git a/src/tests/__fixtures__/an/kenya-sim-card-biometrics.ts b/src/tests/__fixtures__/an/kenya-sim-card-biometrics.ts new file mode 100644 index 0000000..a37deef --- /dev/null +++ b/src/tests/__fixtures__/an/kenya-sim-card-biometrics.ts @@ -0,0 +1,41 @@ +export default `“Avoid disconnection, update your SIM registration details._”_ This is the [message](https://www.airtelkenya.com/customer-registration) that Kenyans have been seeing for months. Kenyan telcos have been threatening to disconnect people from mobile phone and internet services if they do not provide new data, including facial images — part of their personal, unchangeable biometrics. The companies [claimed](https://twitter.com/Safaricom_Care/status/1463092258283085831) it was required under the [Kenya Information and Communications (Registration of SIM-Cards) Regulations, 2015](https://www.ca.go.ke/wp-content/uploads/2018/02/Registration-of-SIM-%E2%80%93Cards-Regulations-2015-1.pdf). In fact, that’s a gross misrepresentation of the law. Collecting biometric data for a SIM card puts people at risk of privacy violations, data breaches and abuse, and even identity theft. **Kenyans must say no, and oppose any new regulations to authorize it**. + +Privacy violations are not new in Kenya. As Access Now has previously [highlighted,](https://www.accessnow.org/wp-content/uploads/2021/10/Data-Protection-in-Kenya.pdf) companies like Safaricom have not only failed to protect subscribers’ personal information, but also refused to take accountability for data breaches. 
Here’s a look at what’s happened so far in Kenya, details on the push to authorize biometric data collection, and what Kenyans can do now to oppose it. + +###### What’s happened so far + +_If you can’t see the highlights below, please check your privacy-enhancing browser extensions. Open in desktop view for the best experience._ + +###### **What the law says** + +Regulation 5 (1) of The [Kenya Information and Communications(Registration of SIM-Cards) Regulations, 2015](https://www.ca.go.ke/wp-content/uploads/2018/02/Registration-of-SIM-%E2%80%93Cards-Regulations-2015-1.pdf) provides for the requirements of SIM registration. It does not require collection of biometric data. This means that the mandatory collection of photographs for SIM card re-registration exercise has no legal basis. It is a breach of people’s privacy, which is protected under Article 31 of the [Constitution of Kenya](http://www.kenyalaw.org:8181/exist/kenyalex/actview.xql?actid=Const2010), as well as the [Kenya Information and Communications (Consumer Protection) Regulations, 2010](https://www.ca.go.ke/wp-content/uploads/2018/02/Consumer-Protection-Regulations-2010-1.pdf), which specifically addresses privacy in the context of communications. + +The CAK which [originally directed](https://www.standardmedia.co.ke/business/news/article/2001439268/unregistered-phone-users-face-mass-switch-off) the collection eventually [rectified](https://www.youtube.com/watch?v=ioG7FicSapE) its wrongful interpretation. But the damage was already done. Many customers, fearing disconnection, have already disclosed their biometric information to the telcos. + +The [Kenya Data Protection Act, 2019](http://kenyalaw.org/kl/fileadmin/pdfdownloads/Acts/2019/TheDataProtectionAct__No24of2019.pdf) defines biometric data as sensitive personal data, and requires the data protection principles to be applied when processing it. The collection of facial biometrics goes against these principles. Yet the Office of the Data Protection Commissioner (ODPC) has been noticeably silent and seemingly uninterested in the breach of data subject rights throughout this process. + +###### **What’s happening now, and why it puts privacy and human rights at risk** + +The Ministry of ICT now plans to replace and revoke the Kenya Information and Communications (Registration of SlM-cards) Regulations, 2015 with new regulations that would authorize the collection of biometric data, the [draft Kenya Information and Communications (Registration of Telecommunications Service Subscribers) Regulations, 2022](https://ict.go.ke/wp-content/uploads/2022/05/Draft-Kenya-Information-and-Communication-Registration-of-telecommunications-service-subscribers-Regulations-2022.pdf). + +There is no logical and legal basis for the collection of biometric data as a prerequisite of SIM card registration. The [claims](https://youtu.be/6RW7C0fyqX8) telcos are making that facial biometrics will enhance security and prevent the commission of crimes are false. Indeed, as [the United Nations High Commissioner for Human Rights](https://documents-dds-ny.un.org/doc/UNDOC/GEN/G18/239/58/PDF/G1823958.pdf?OpenElement) has explained, the fact that biometrics are inextricably linked to a person’s identity makes it more difficult to recover in cases of data breach and identity theft. You can change a password, but you cannot change your face. 
+ +Biometric databases also increase the risk of state surveillance, as the [Clearview AI](https://www.nytimes.com/2020/01/18/technology/clearview-privacy-facial-recognition.html) scandal demonstrates. In Kenya, law enforcement surveillance of people’s communications is already putting [human rights defenders](https://defenderscoalition.org/4912-2/) and others at risk. Authorizing mass biometric data collection makes it more likely it will fall into the wrong hands. + +###### **How Kenyans can fight biometric data collection and defend their rights** + +Kenya has data protection under the law. Kenya now needs data protection in practice. That means the Office of the Data Protection Commissioner must immediately take action to investigate the telcos’ breach of privacy and data subject rights, advise the Ministry of ICT against making the collection of biometric data a prerequisite of SIM card registration, and and offer remedy to the subscribers whose data have already been unlawfully collected. These data should be deleted. + +**Access Now and the Kenyan civil society are already pushing back:** + +* ARTICLE 19 Eastern Africa sent an Access to Information request to telcos to confirm the biometric data in their possession. Safaricom has so far failed to respond. +* Katiba Institute has [filed](https://www.the-star.co.ke/opinion/2022-06-06-huduma-bill-offers-safeguards-on-invasion-of-privacy/) a judicial review application at the High Court of Kenya seeking a number of legal requests, including an order compelling Safaricom to delete the unlawfully collected biometrics. +* Civil society and academia have submitted a [joint memorandum](https://www.kictanet.or.ke/joint-submission-on-draft-kica-regulations-2022/) to the Ministry of ICT calling for the deletion of the requirement for collection of biometric data from the draft subscriber registration regulations. + +**Now it’s your turn:** + +Here is what you can do to fight for your rights: + +* **Step 1: Submit an access to information request –** Section 26 (a) and (b) of the Data protection Act, 2019 gives you the right to access your data. (See Regulation 9 of the [_Data Protection (General) Regulations, 2021_](https://www.odpc.go.ke/download/data-protection-regulations/) for submission requirements.) +* **Step 2: Submit an erasure request** – Section 40 (1) (b) of the Data Protection Act, 2019 gives you the right to demand that your data be deleted if it was collected illegally. (See Regulation 12 of the [_Data Protection (General) Regulations, 2021_](https://www.odpc.go.ke/download/data-protection-regulations/) for submission requirements.) +* **Step 3: Demand the ODPC to act** – [Join our call demanding the Office of the Data Protection Commissioner](https://twitter.com/accessnow/status/1542515404680986629) to fulfill its mandate and protect the privacy of Kenyans.`; diff --git a/src/tests/__fixtures__/an/russias-record-war-on-connectivity.ts b/src/tests/__fixtures__/an/russias-record-war-on-connectivity.ts new file mode 100644 index 0000000..340b780 --- /dev/null +++ b/src/tests/__fixtures__/an/russias-record-war-on-connectivity.ts @@ -0,0 +1,27 @@ +export default `_Content note: This post refers to data on internet shutdown instances collated by trusted partners in Russia. To date, we have not yet verified these numbers using Access Now’s_ [**_Shutdown Tracker Optimization Project (STOP) methodology_**](https://www.accessnow.org/guide/shutdown-tracker-optimization-project/)_. 
As such, the data contained in this post may not correspond to future verified figures. You can find more information about how we track and measure global internet shutdowns_ [**_here_**](https://www.accessnow.org/guide/shutdown-tracker-optimization-project/#faq)_._ + +* * * + +Russia has recently been breaking its own [**records when it comes to cutting connectivity**](https://globalvoices.org/2025/01/22/shutting-down-the-net-the-growing-threat-of-russian-internet-censorship/) across the country. According to Russian internet monitoring project **[Na Svyazi](https://t.me/na_svyazi_helpdesk)**, authorities shut down the internet more [**than 650 times**](https://meduza.io/news/2025/07/01/na-svyazi-v-rossii-v-iyune-655-raz-otklyuchali-mobilnyy-internet) in June alone, [**most frequently**](https://meduza.io/en/feature/2025/07/07/mapping-russia-s-internet-blackouts) in the cities of Nizhny Novgorod, Saratov, Tula, Omsk, and Rostov. Meanwhile in Moscow, internet shutdowns intensified during May’s [**Victory Day celebration**](https://www.accessnow.org/press-release/kremlin-must-end-internet-shutdowns-national-events/), with media outlet [**_Meduza_ reporting**](https://meduza.io/en/feature/2025/07/07/mapping-russia-s-internet-blackouts) that the internet [**was jammed 17 times**](https://meduza.io/en/feature/2025/07/07/mapping-russia-s-internet-blackouts) in the past two months. And now Russian authorities look set to tighten their iron grip over the internet even further, by potentially blocking WhatsApp across the country in a bid to push people onto highly surveilled, government-controlled messaging services instead. + +##### The daily impact of internet shutdowns + +These shutdowns drastically affect the day-to-day lives of everyone in Russia. People [**struggle**](https://novayagazeta.ru/articles/2025/06/19/stranu-otkliuchili) to make online or card payments, withdraw cash from certain ATMs, receive deliveries, access car sharing services, or use online navigation and map tools, among other things. In some cases, people must [**take a bus to the nearest town**](https://meduza.io/en/feature/2025/07/07/mapping-russia-s-internet-blackouts) with a functioning ATM, simply to withdraw cash to pay for groceries, while people whose work heavily depends on the internet risk [**losing their incomes**](https://meduza.io/feature/2025/07/01/vse-uzhe-shutyat-o-internet-dali-kak-goryachuyu-vodu). + +Internet shutdowns also endanger people’s lives. According to independent media reports, [**shutdowns in the Udmurt Republic disabled an airstrike alarm**](https://novayagazeta.eu/articles/2025/07/08/gde-i-kak-v-rossii-otkliuchaiut-internet) at the Izhevsk Electromechanical Plant shortly before it was hit by a drone attack, resulting in the death of three people. Meanwhile, mobile internet disruptions mean that, in life-threatening situations, people have to waste precious time hunting for a signal to [**call for an ambulance**](https://meduza.io/en/feature/2025/07/07/mapping-russia-s-internet-blackouts) or doctor. + +##### **Justifying an online iron curtain** + +Russian authorities frequently [**justify**](https://theins.ru/en/news/283119) cutting connectivity as necessary to combat drone attacks, such as those launched as part of the Ukrainian military’s June 2025 [****“**Spiderweb” operation**](https://meduza.io/en/feature/2025/06/03/the-impact-of-spiderweb). 
Yet an [**investigation by _The Insider_ and Na Svyazi**](https://theins.ru/en/news/282568) shows that fewer than half of all the shutdowns imposed since May 1 coincided with Ukraine’s drone attacks in the same regions, with disruptions recorded in many regions with no history of drone attacks or [**proximity to the frontlines**](https://meduza.io/en/feature/2025/07/07/mapping-russia-s-internet-blackouts), such as the Chukotka Autonomous Okrug. + +In an attempt to explain this discrepancy, some local authorities have [**claimed**](https://www.moscowtimes.ru/2025/06/17/v-kazhdom-tretem-regione-rossii-zaglushili-internet-a166310) that they are also forced to impose shutdowns due to law enforcement and telecommunications companies’ joint assessment of the “operational situation.” However, Russia’s track record on digital repression suggests that such authorities may be using both drone attacks and vague “operational” concerns as a pretext to restrict people in Russia from accessing information online or exercising their right to freedom of speech, especially via messaging apps. On [**June 30 alone, for example**](https://www.agents.media/v-odnovremennyh-sboyah-v-whatsapp-i-telegram-uvideli-podgotovku-k-blokirovke-servisov/), more than 12,000 reports of Telegram disruptions and nearly 1,500 reports of disruptions to WhatsApp were [**submitted**](https://www.agents.media/v-odnovremennyh-sboyah-v-whatsapp-i-telegram-uvideli-podgotovku-k-blokirovke-servisov/) to both Na Svyazi and connectivity monitoring service [**Downdetector**](https://downdetector.com/). Subsequently, on July 4, Telegram and WhatsApp [**were again throttled in Moscow**](https://t.me/zatelecom/30661) and elsewhere. + +Internet freedom advocates in Russia [**suggest**](https://www.dw.com/ru/drony-but-po-ekonomike-rf-daze-bez-vzryvov/a-73229974) that local officials may be cutting off mobile internet in an attempt to convince the Kremlin that they are taking measures against drones, despite the fact that mobile internet [**is not strictly needed**](https://visitukraine.today/blog/6346/communications-under-threat-will-ukraine-start-jamming-mobile-internet-due-to-drone-attacks#are-there-plans-to-shut-down-communications-what-the-government-is-saying) to operate drones. This weaponization and normalization of internet shutdowns reflect a wider trend, with [**conflict now the leading trigger for shutdowns worldwide**](https://www.accessnow.org/internet-shutdowns-2024/). + +##### What happens next? + +The Russian government continues to systematically isolate the country’s internet infrastructure and, therefore, the people of Russia from the rest of the world. In June 2025, Cloudflare [**confirmed**](https://blog.cloudflare.com/russian-internet-users-are-unable-to-access-the-open-internet/) that Russian authorities are preventing people in Russia from accessing websites and services that rely on its platform, by limiting the amount of content that can be served to a mere 16 KB, rendering many websites barely usable. 
This goes hand-in-hand with the ongoing [**throttling of YouTube**](https://meduza.io/en/news/2024/12/23/youtube-traffic-in-russia-drops-to-20-percent-of-pre-throttling-levels-platform-de-factoblocked-in-country), the [**deplatforming of VPNs**](https://www.techradar.com/vpn/vpn-privacy-security/a-new-wave-of-blocks-in-russia-targets-vpn-apps-and-cloudflare-subnets) from Apple’s App Store, and [**bans on Facebook and Instagram**](https://www.theguardian.com/world/2022/mar/21/russia-bans-facebook-and-instagram-under-extremism-law) — all measures instituted in an effort to prevent people from accessing independent information and resisting President Vladimir Putin’s propaganda. + +Now it seems that WhatsApp, one of the last remaining channels allowing for the free exchange of opinions, may be in danger. It has been suggested that the [**government plans**](https://meduza.io/en/feature/2025/07/18/there-s-a-99-percent-chance-it-will-happen) to block this app in a bid to [**push people to switch to an alternative messenger app known as Max**](https://www.rferl.org/a/russia-sovereign-internet-super-app-tech/33439093.html), which will be [**pre-installed on all smartphones sold in Russia**](https://cybernews.com/news/russia-to-create-state-run-messaging-app-whatsapp-telegram/), integrated with government services, and heavily controlled and surveilled by authorities. We’ve seen this playbook before, when Russian authorities [**designated**](https://www.bbc.com/news/technology-63218095) Meta as an extremist organization and [**throttled YouTube**](https://meduza.io/en/news/2024/12/23/youtube-traffic-in-russia-drops-to-20-percent-of-pre-throttling-levels-platform-de-factoblocked-in-country), then [**encouraged people**](https://rus.azattyq.org/a/31750439.html) to switch to Russian platforms like VK, Odnoklassniki, and [**RuTube**](https://www.codastory.com/newsletters/russia-rutube/). + +With [**digital authoritarianism**](https://www.accessnow.org/digital-dictatorship-and-resistance-in-eastern-europe-and-central-asia/) only deepening in Russia, it is vital to break this increasing and ever-more disturbing pattern of cutting connectivity. Russian authorities must stop hampering people’s access to an open, free, and secure internet, by ceasing all throttling measures and refraining from imposing any further internet shutdowns. At the same time, internet service providers should resist any orders that would stop them from providing high-quality, unrestricted internet access — a prerequisite for allowing the people of Russia to exercise their human rights to freedom of expression and information.`; diff --git a/src/tests/__fixtures__/an/vodafone-challenged-release-transparency-report.ts b/src/tests/__fixtures__/an/vodafone-challenged-release-transparency-report.ts new file mode 100644 index 0000000..88e9791 --- /dev/null +++ b/src/tests/__fixtures__/an/vodafone-challenged-release-transparency-report.ts @@ -0,0 +1,29 @@ +export default `NEW YORK — As more cell phone companies face increased pressure to release the data of their users, the privacy and safety of people everywhere is under greater threat. + +That’s why, at next Tuesday’s AGM, Access and FairPensions will ask Vodafone to release a transparency report. + +In the U.S. alone, cell phone companies received 1.3 million government requests for user data last year. 
If this was the number of requests for just one country, international companies like Vodafone, with 371 million subscribers worldwide in 70 countries, would be expected to receive many more. So far, it has been silent on how many requests it receives. + +“Last year, we asked what Vodafone would do to prevent another Egypt-type internet shutdown,” said Brett Solomon, Executive Director of Access, a global non-profit that stands for internet freedom. “In response, Vodafone made progress on privacy. This Tuesday, the focus shifts to transparency at home as well as abroad — will Vodafone tell customers how many law enforcement requests for their data it receives, and how it responds?” + +Government requests for user data are a hot topic. Corporate transparency reports have issued from Google and Twitter in the past year, and U.S. cellular companies told a congressman how many data requests they receive, and their policies for complying. The vast number of requests, often without judicial approval, surprised many observers. + +The pressure is on for Vodafone — 45% owner of US cell company Verizon Wireless, who received 260,000 requests for data in 2011 — to issue its own transparency report. Its policy for replying to government data requests is not currently known. + +A copy of the AGM question is below; Access will also provide its Telco Action Plan, a set of concrete measures Access created for telcos to better manage and remedy their human rights impacts. + +Question: + +“Mr. Chairman, + +My name is Louise Rouse. I am a shareholder. I ask this question on behalf of myself, AccessNow.org and the 10,000 Access supporters who endorsed this question. + +At last year’s meeting, Brett Solomon of Access asked what Vodafone was doing to ensure it did not face a repeat of the network shutdowns in Egypt in January 2011. We are pleased to note the actions taken by the company over the last year, including the privacy programme assessment carried out by PWC and Vodafone’s involvement in the Industry Dialogue. + +Today, I would like to ask a question about the company’s transparency on reporting requests for customer data by governments and their law enforcement agencies. The Telco Action Plan which Mr. Solomon gave you last year, and former Chairman Sir John Bond committed Vodafone to reviewing, clearly sets out how telecoms should respond to government requests for user data. This issue is fast becoming a public concern here and in all countries where Vodafone operates. + +In the US for example, a number of telecoms including Verizon Wireless (in which Vodafone holds 45% of the shares) were asked by a US Congressman to provide information on law enforcement requests for customer data. This month, the query found that US cellular providers received an extraordinary 1.3 million requests in 2011 alone, and complied with most. + +The Vodafone website states that “We have a clear policy to ensure we minimise the impact on our customers when interacting with governments and assisting law enforcement” but provides few details on the content of this policy. 
+ +Will Vodafone proactively commit to providing a transparency report in the UK and globally, detailing how many government requests it receives for customer data, how many it complies with, and your policy for handing over such data, including how you vet those requests?”`; diff --git a/src/tests/crawlers/CrawlErrorManager.test.ts b/src/tests/crawlers/CrawlErrorManager.test.ts index 5b25509..abde57c 100644 --- a/src/tests/crawlers/CrawlErrorManager.test.ts +++ b/src/tests/crawlers/CrawlErrorManager.test.ts @@ -1,7 +1,6 @@ import { beforeEach, describe, expect, it, vi } from "vitest"; import { type CrawlErrorManager, - categorizeErrors, createCrawlErrorManager, } from "@/crawlers/CrawlErrorManager.js"; import type { MetadataStore } from "@/storage/MetadataStore.js"; @@ -58,36 +57,23 @@ describe("CrawlErrorManager", () => { expect(mockMetadataStore.addSessionErrors).not.toHaveBeenCalled(); }); - }); - describe("Error Categorization", () => { - it("should categorize field extraction warnings correctly", () => { - const warnings = [ - "Optional field 'author' not found for Item 1", - "Required field 'title' not found for Item 2", - "Content extraction failed for URL: example.com", - "Unknown extraction error", - ]; + it("should store errors using the generic addErrors method", () => { + const listingErrors = ["Listing error 1", "Listing error 2"]; + const contentErrors = ["Content error 1", "Content error 2"]; - errorManager.addFieldExtractionWarnings(warnings); + errorManager.addErrors("listing", listingErrors); + errorManager.addErrors("content", contentErrors); - // Should categorize listing vs content errors automatically expect(mockMetadataStore.addSessionErrors).toHaveBeenCalledWith( sessionId, "listing", - [ - "Optional field 'author' not found for Item 1", - "Required field 'title' not found for Item 2", - ], + listingErrors, ); - expect(mockMetadataStore.addSessionErrors).toHaveBeenCalledWith( sessionId, "content", - [ - "Content extraction failed for URL: example.com", - "Unknown extraction error", - ], + contentErrors, ); }); }); @@ -132,54 +118,30 @@ describe("CrawlErrorManager", () => { contentErrors: [], }); }); - }); -}); - -describe("categorizeErrors", () => { - it("should categorize listing errors correctly", () => { - const errors = [ - "Optional field 'author' not found", - "Required field 'title' not found", - "Item missing required fields: title, url", - "Item contained no extractable data", - ]; - const result = categorizeErrors(errors); - - expect(result.listingErrors).toEqual(errors); - expect(result.contentErrors).toEqual([]); - }); - - it("should categorize content errors correctly", () => { - const errors = [ - "Content extraction failed", - "Content page load timeout", - "Content parsing error occurred", - ]; + it("should handle session with missing metadata fields", () => { + const mockSession = { + sessionId: "test", + sourceId: "test", + sourceName: "test", + startTime: "test", + endTime: null, + metadata: JSON.stringify({ + // Intentionally missing listingErrors and contentErrors + }), + }; - const result = categorizeErrors(errors); + const getSessionMock = mockMetadataStore.getSession as ReturnType< + typeof vi.fn + >; + getSessionMock.mockReturnValue(mockSession); - expect(result.listingErrors).toEqual([]); - expect(result.contentErrors).toEqual(errors); - }); + const result = errorManager.getSessionErrors(); - it("should handle mixed error types", () => { - const errors = [ - "Required field 'title' not found", - "Content extraction failed", - "Optional 
field 'author' not found", - "Content parsing error", - ]; - - const result = categorizeErrors(errors); - - expect(result.listingErrors).toEqual([ - "Required field 'title' not found", - "Optional field 'author' not found", - ]); - expect(result.contentErrors).toEqual([ - "Content extraction failed", - "Content parsing error", - ]); + expect(result).toEqual({ + listingErrors: [], + contentErrors: [], + }); + }); }); }); diff --git a/src/tests/crawlers/MetadataTracker.basic.test.ts b/src/tests/crawlers/MetadataTracker.basic.test.ts index f13d616..b371ddf 100644 --- a/src/tests/crawlers/MetadataTracker.basic.test.ts +++ b/src/tests/crawlers/MetadataTracker.basic.test.ts @@ -52,23 +52,21 @@ describe("MetadataTracker - Basic Functionality", () => { type: CRAWLER_TYPES.LISTING, listing: { url: "https://example.com", - items: { - container_selector: ".item", - fields: { - title: { selector: "h2", attribute: "text", optional: false }, - url: { selector: "a", attribute: "href", optional: false }, - publishedDate: { - selector: ".date", - attribute: "text", - optional: true, - }, + container_selector: ".item", + fields: { + title: { selector: "h2", attribute: "text", optional: false }, + url: { selector: "a", attribute: "href", optional: false }, + date: { + selector: ".date", + attribute: "text", + optional: true, }, }, }, content: { container_selector: ".article", fields: { - summary: { selector: ".summary", attribute: "text", optional: true }, + content: { selector: ".summary", attribute: "text", optional: true }, }, }, }; @@ -89,8 +87,8 @@ describe("MetadataTracker - Basic Functionality", () => { expect(metadata.totalFilteredItems).toBe(0); expect(metadata.pagesProcessed).toBe(0); expect(metadata.contentsCrawled).toBe(0); - expect(metadata.fieldStats).toHaveLength(3); // title, url, publishedDate - expect(metadata.contentFieldStats).toHaveLength(1); // summary + expect(metadata.fieldStats).toHaveLength(3); // title, url, date + expect(metadata.contentFieldStats).toHaveLength(1); // content expect(metadata.listingErrors).toEqual([]); expect(metadata.contentErrors).toEqual([]); }); @@ -168,8 +166,8 @@ describe("MetadataTracker - Basic Functionality", () => { ]); metadataTracker.addContentErrors(["Another content error"]); - const metadata = metadataTracker.getMetadata(); // Errors are now stored directly in database, not in memory + const metadata = metadataTracker.getMetadata(); expect(metadata.contentErrors).toEqual([]); // Verify database storage method was called correctly @@ -191,33 +189,6 @@ describe("MetadataTracker - Basic Functionality", () => { ); }); - it("should track field extraction warnings", () => { - metadataTracker.addFieldExtractionWarnings([ - "Optional field 'author' not found for \"Test Article\"", - "Content extraction warning: element not found", - ]); - - const metadata = metadataTracker.getMetadata(); - // Errors are now stored directly in database, not in memory - expect(metadata.listingErrors).toEqual([]); - expect(metadata.contentErrors).toEqual([]); - - // Verify database storage method was called correctly - expect(mockAddSessionErrors).toHaveBeenCalledTimes(2); // Once for listing, once for content - expect(mockAddSessionErrors).toHaveBeenNthCalledWith( - 1, - expect.any(String), - "listing", - ["Optional field 'author' not found for \"Test Article\""], - ); - expect(mockAddSessionErrors).toHaveBeenNthCalledWith( - 2, - expect.any(String), - "content", - ["Content extraction warning: element not found"], - ); - }); - it("should set stopped reason", () => { 
metadataTracker.setStoppedReason(StoppedReason.MAX_PAGES); diff --git a/src/tests/crawlers/MetadataTracker.items.test.ts b/src/tests/crawlers/MetadataTracker.items.test.ts index 2ec3dba..e1b2166 100644 --- a/src/tests/crawlers/MetadataTracker.items.test.ts +++ b/src/tests/crawlers/MetadataTracker.items.test.ts @@ -11,6 +11,7 @@ const mockGetSession = vi.fn(); const mockEndSession = vi.fn(); const mockCheckpoint = vi.fn(); const mockGetSessionContents = vi.fn(); +const mockAddSessionErrors = vi.fn(); const mockMetadataStore: Partial = { createSession: mockCreateSession, @@ -19,6 +20,7 @@ const mockMetadataStore: Partial = { endSession: mockEndSession, checkpoint: mockCheckpoint, getSessionContents: mockGetSessionContents, + addSessionErrors: mockAddSessionErrors, }; describe("MetadataTracker - Items Processing", () => { @@ -35,6 +37,7 @@ describe("MetadataTracker - Items Processing", () => { mockEndSession.mockClear(); mockCheckpoint.mockClear(); mockGetSessionContents.mockClear(); + mockAddSessionErrors.mockClear(); // Set up default return values mockGetSessionContents.mockReturnValue([]); @@ -46,23 +49,21 @@ describe("MetadataTracker - Items Processing", () => { type: CRAWLER_TYPES.LISTING, listing: { url: "https://example.com", - items: { - container_selector: ".item", - fields: { - title: { selector: "h2", attribute: "text", optional: false }, - url: { selector: "a", attribute: "href", optional: false }, - publishedDate: { - selector: ".date", - attribute: "text", - optional: true, - }, + container_selector: ".item", + fields: { + title: { selector: "h2", attribute: "text", optional: false }, + url: { selector: "a", attribute: "href", optional: false }, + date: { + selector: ".date", + attribute: "text", + optional: true, }, }, }, content: { container_selector: ".article", fields: { - summary: { selector: ".summary", attribute: "text", optional: true }, + content: { selector: ".summary", attribute: "text", optional: true }, }, }, }; diff --git a/src/tests/crawlers/MetadataTracker.return-value.test.ts b/src/tests/crawlers/MetadataTracker.return-value.test.ts index 17be908..7b044bb 100644 --- a/src/tests/crawlers/MetadataTracker.return-value.test.ts +++ b/src/tests/crawlers/MetadataTracker.return-value.test.ts @@ -52,23 +52,21 @@ describe("MetadataTracker - Return Values", () => { type: CRAWLER_TYPES.LISTING, listing: { url: "https://example.com", - items: { - container_selector: ".item", - fields: { - title: { selector: "h2", attribute: "text", optional: false }, - url: { selector: "a", attribute: "href", optional: false }, - publishedDate: { - selector: ".date", - attribute: "text", - optional: true, - }, + container_selector: ".item", + fields: { + title: { selector: "h2", attribute: "text", optional: false }, + url: { selector: "a", attribute: "href", optional: false }, + date: { + selector: ".date", + attribute: "text", + optional: true, }, }, }, content: { container_selector: ".article", fields: { - summary: { selector: ".summary", attribute: "text", optional: true }, + content: { selector: ".summary", attribute: "text", optional: true }, }, }, }; diff --git a/src/tests/crawlers/MetadataTracker.session.test.ts b/src/tests/crawlers/MetadataTracker.session.test.ts index fbf3393..15ed63b 100644 --- a/src/tests/crawlers/MetadataTracker.session.test.ts +++ b/src/tests/crawlers/MetadataTracker.session.test.ts @@ -14,6 +14,7 @@ const mockGetSession = vi.fn(); const mockEndSession = vi.fn(); const mockCheckpoint = vi.fn(); const mockGetSessionContents = vi.fn(); +const mockAddSessionErrors 
= vi.fn(); const mockMetadataStore: Partial = { createSession: mockCreateSession, @@ -22,6 +23,7 @@ const mockMetadataStore: Partial = { endSession: mockEndSession, checkpoint: mockCheckpoint, getSessionContents: mockGetSessionContents, + addSessionErrors: mockAddSessionErrors, }; describe("MetadataTracker - Session Management", () => { @@ -38,6 +40,7 @@ describe("MetadataTracker - Session Management", () => { mockEndSession.mockClear(); mockCheckpoint.mockClear(); mockGetSessionContents.mockClear(); + mockAddSessionErrors.mockClear(); // Set up default return values mockGetSessionContents.mockReturnValue([]); @@ -49,23 +52,21 @@ describe("MetadataTracker - Session Management", () => { type: CRAWLER_TYPES.LISTING, listing: { url: "https://example.com", - items: { - container_selector: ".item", - fields: { - title: { selector: "h2", attribute: "text", optional: false }, - url: { selector: "a", attribute: "href", optional: false }, - publishedDate: { - selector: ".date", - attribute: "text", - optional: true, - }, + container_selector: ".item", + fields: { + title: { selector: "h2", attribute: "text", optional: false }, + url: { selector: "a", attribute: "href", optional: false }, + date: { + selector: ".date", + attribute: "text", + optional: true, }, }, }, content: { container_selector: ".article", fields: { - summary: { selector: ".summary", attribute: "text", optional: true }, + content: { selector: ".summary", attribute: "text", optional: true }, }, }, }; diff --git a/src/tests/crawlers/extractors/BrowserFieldExtractor.test.ts b/src/tests/crawlers/extractors/BrowserFieldExtractor.test.ts index 5cd67b8..b178bb2 100644 --- a/src/tests/crawlers/extractors/BrowserFieldExtractor.test.ts +++ b/src/tests/crawlers/extractors/BrowserFieldExtractor.test.ts @@ -154,42 +154,6 @@ describe("BrowserFieldExtractor", () => { ); }); - it("should handle optional missing field without error", () => { - const extractionFunction = createBrowserExtractionFunction(); - - // Mock browser context - container exists but optional field element doesn't - const mockContainer = { - querySelector: vi.fn().mockImplementation((selector: string) => { - return null; // Always return null for field elements - }), - }; - - global.document = { - querySelector: vi.fn().mockImplementation((selector: string) => { - if (selector === ".container") { - return mockContainer; - } - return null; - }), - } as any; - - const result = extractionFunction({ - container_selector: ".container", - fields: { - optional: { - selector: ".missing", - attribute: "text", - optional: true, - }, - }, - }); - - expect(result.results.optional).toBeNull(); - expect(result.extractionErrors).toContain( - "Optional field 'optional' not found: selector '.missing' returned no results", - ); - }); - it("should extract content from container element when selector is empty", () => { const extractionFunction = createBrowserExtractionFunction(); diff --git a/src/tests/crawlers/extractors/BrowserFieldExtractor.whitespace.test.ts b/src/tests/crawlers/extractors/BrowserFieldExtractor.whitespace.test.ts index a29da55..aa06d02 100644 --- a/src/tests/crawlers/extractors/BrowserFieldExtractor.whitespace.test.ts +++ b/src/tests/crawlers/extractors/BrowserFieldExtractor.whitespace.test.ts @@ -91,47 +91,5 @@ describe("BrowserFieldExtractor - Whitespace handling", () => { expect(result.results.content).toBe("Main content plus"); expect(result.extractionErrors).toHaveLength(0); }); - - it("should handle empty text after whitespace normalization as missing field", () => { - 
const extractionFunction = createBrowserExtractionFunction(); - - // Mock browser context with only whitespace - const mockElement = { - textContent: " \n \t \r ", - }; - - const mockContainer = { - querySelector: vi.fn().mockReturnValue(mockElement), - }; - - // Mocking global document - global.document = { - querySelector: vi.fn().mockImplementation((selector: string) => { - if (selector === ".container") { - return mockContainer; - } - return null; - }), - } as any; - - const result = extractionFunction({ - container_selector: ".container", - fields: { - content: { - selector: ".content", - attribute: "text", - optional: true, // Make it optional to avoid failing the test - }, - }, - }); - - // Should return null for text that becomes empty after normalization - expect(result.results.content).toBeNull(); - // Should have an extraction error for the missing field - expect(result.extractionErrors).toHaveLength(1); - expect(result.extractionErrors[0]).toContain( - "Optional field 'content' not found", - ); - }); }); }); diff --git a/src/tests/crawlers/extractors/ConcurrentContentExtractor.test.ts b/src/tests/crawlers/extractors/ConcurrentContentExtractor.test.ts index a280c83..c94719f 100644 --- a/src/tests/crawlers/extractors/ConcurrentContentExtractor.test.ts +++ b/src/tests/crawlers/extractors/ConcurrentContentExtractor.test.ts @@ -23,11 +23,9 @@ describe("ConcurrentContentExtractor", () => { type: CRAWLER_TYPES.LISTING, listing: { url: "https://example.com", - items: { - container_selector: ".article", - fields: { - title: { selector: ".title", attribute: "text" }, - }, + container_selector: ".article", + fields: { + title: { selector: ".title", attribute: "text" }, }, }, content: { diff --git a/src/tests/crawlers/extractors/ContentDataMapper.test.ts b/src/tests/crawlers/extractors/ContentDataMapper.test.ts index 4364445..0e0f4e2 100644 --- a/src/tests/crawlers/extractors/ContentDataMapper.test.ts +++ b/src/tests/crawlers/extractors/ContentDataMapper.test.ts @@ -18,7 +18,7 @@ describe("ContentDataMapper", () => { const item: CrawledData = { url: "https://example.com/article", title: "Listing Title", - content: "Listing excerpt", + content: "Listing content", crawledAt: new Date(), source: "test", metadata: {}, @@ -44,7 +44,7 @@ describe("ContentDataMapper", () => { const item: CrawledData = { url: "https://example.com/article", title: "Listing Title", - content: "Listing excerpt", + content: "Listing content", author: "Existing Author", crawledAt: new Date(), source: "test", @@ -65,7 +65,7 @@ describe("ContentDataMapper", () => { expect(item.publishedDate).toBe("parsed-2023-01-01"); // Should preserve existing content and author - expect(item.content).toBe("Listing excerpt"); + expect(item.content).toBe("Listing content"); expect(item.author).toBe("Existing Author"); }); @@ -73,7 +73,7 @@ describe("ContentDataMapper", () => { const item: CrawledData = { url: "https://example.com/article", title: "Listing Title", - content: "Listing excerpt", + content: "Listing content", crawledAt: new Date(), source: "test", metadata: {}, @@ -85,7 +85,7 @@ describe("ContentDataMapper", () => { // Should not update fields with null/undefined values expect(item.title).toBe("Listing Title"); - expect(item.content).toBe("Listing excerpt"); + expect(item.content).toBe("Listing content"); expect(item.author).toBeUndefined(); expect(item.publishedDate).toBeUndefined(); }); @@ -94,7 +94,7 @@ describe("ContentDataMapper", () => { const item: CrawledData = { url: "https://example.com/article", title: "Listing 
Title", - content: "Listing excerpt", + content: "Listing content", crawledAt: new Date(), source: "test", metadata: {}, @@ -119,7 +119,7 @@ describe("ContentDataMapper", () => { const item: CrawledData = { url: "https://example.com/article", title: "Listing Title", - content: "Listing excerpt", + content: "Listing content", crawledAt: new Date(), source: "test", metadata: {}, diff --git a/src/tests/crawlers/extractors/ContentPageExtractor.test.ts b/src/tests/crawlers/extractors/ContentPageExtractor.test.ts index 02a8565..391f9bd 100644 --- a/src/tests/crawlers/extractors/ContentPageExtractor.test.ts +++ b/src/tests/crawlers/extractors/ContentPageExtractor.test.ts @@ -47,11 +47,9 @@ describe("ContentPageExtractor", () => { type: CRAWLER_TYPES.LISTING, listing: { url: "https://example.com", - items: { - container_selector: ".article", - fields: { - title: { selector: ".title", attribute: "text" }, - }, + container_selector: ".article", + fields: { + title: { selector: ".title", attribute: "text" }, }, }, content: { diff --git a/src/tests/crawlers/extractors/ListingPageExtractor.test.ts b/src/tests/crawlers/extractors/ListingPageExtractor.test.ts index d88f14a..71d4bf5 100644 --- a/src/tests/crawlers/extractors/ListingPageExtractor.test.ts +++ b/src/tests/crawlers/extractors/ListingPageExtractor.test.ts @@ -11,13 +11,11 @@ describe("ListingPageExtractor", () => { type: CRAWLER_TYPES.LISTING, listing: { url: "https://example.com", - items: { - container_selector: ".article", - fields: { - title: { selector: ".title", attribute: "text" }, - url: { selector: "a", attribute: "href" }, - author: { selector: ".author", attribute: "text", optional: true }, - }, + container_selector: ".article", + fields: { + title: { selector: ".title", attribute: "text" }, + url: { selector: "a", attribute: "href" }, + author: { selector: ".author", attribute: "text", optional: true }, }, }, content: { @@ -30,44 +28,51 @@ describe("ListingPageExtractor", () => { it("should extract items from page successfully", async () => { const extractor = createListingPageExtractor(); + + // Mock page with required methods const mockPage = { evaluate: vi.fn().mockResolvedValue([ { - item: { + values: { title: "Test Article", url: "/article/1", author: "John Doe", + date: null, }, fieldResults: { title: { success: true, value: "Test Article" }, url: { success: true, value: "/article/1" }, author: { success: true, value: "John Doe" }, }, + isExcluded: false, hasRequiredFields: true, missingRequiredFields: [], extractionErrors: [], }, ]), url: vi.fn().mockReturnValue("https://example.com"), + exposeFunction: vi.fn(), + removeExposedFunction: vi.fn(), + waitForSelector: vi.fn(), } as unknown as Page; - const fieldStats = [ + const fieldStats: FieldExtractionStats[] = [ { - fieldName: "title", + fieldName: "title" as const, successCount: 0, totalAttempts: 0, isOptional: false, missingItems: [], }, { - fieldName: "url", + fieldName: "url" as const, successCount: 0, totalAttempts: 0, isOptional: false, missingItems: [], }, { - fieldName: "author", + fieldName: "author" as const, successCount: 0, totalAttempts: 0, isOptional: true, @@ -95,44 +100,61 @@ describe("ListingPageExtractor", () => { it("should filter items missing required fields", async () => { const extractor = createListingPageExtractor(); + + // Mock page with required methods const mockPage = { url: () => "https://test.com", evaluate: vi.fn().mockResolvedValue([ { - item: { title: "Complete Article", url: "/article/1" }, + values: { + title: "Complete Article", + url: 
"/article/1", + author: null, + date: null, + }, fieldResults: { title: { success: true, value: "Complete Article" }, url: { success: true, value: "/article/1" }, author: { success: false, value: null }, }, + isExcluded: false, hasRequiredFields: true, missingRequiredFields: [], extractionErrors: [], }, { - item: { title: "Incomplete Article" }, + values: { + title: "Incomplete Article", + url: null, + author: null, + date: null, + }, fieldResults: { title: { success: true, value: "Incomplete Article" }, url: { success: false, value: null }, author: { success: false, value: null }, }, + isExcluded: false, hasRequiredFields: false, missingRequiredFields: ["url"], extractionErrors: [], }, ]), + exposeFunction: vi.fn(), + removeExposedFunction: vi.fn(), + waitForSelector: vi.fn(), } as unknown as Page; const fieldStats = [ { - fieldName: "title", + fieldName: "title" as const, successCount: 0, totalAttempts: 0, isOptional: false, missingItems: [], }, { - fieldName: "url", + fieldName: "url" as const, successCount: 0, totalAttempts: 0, isOptional: false, @@ -157,8 +179,13 @@ describe("ListingPageExtractor", () => { it("should handle empty results gracefully", async () => { const extractor = createListingPageExtractor(); + + // Mock page with required methods const mockPage = { evaluate: vi.fn().mockResolvedValue([]), + exposeFunction: vi.fn(), + removeExposedFunction: vi.fn(), + waitForSelector: vi.fn(), } as unknown as Page; const fieldStats: FieldExtractionStats[] = []; @@ -176,15 +203,23 @@ describe("ListingPageExtractor", () => { it("should handle extraction errors and include them in filtered reasons", async () => { const extractor = createListingPageExtractor(); + + // Mock page with required methods const mockPage = { evaluate: vi.fn().mockResolvedValue([ { - item: { title: "Error Article" }, + values: { + title: "Error Article", + url: null, + author: null, + date: null, + }, fieldResults: { title: { success: true, value: "Error Article" }, url: { success: false, value: null, error: "Selector not found" }, author: { success: false, value: null, error: "Element missing" }, }, + isExcluded: false, hasRequiredFields: false, missingRequiredFields: ["url"], extractionErrors: [ @@ -194,18 +229,21 @@ describe("ListingPageExtractor", () => { }, ]), url: vi.fn().mockReturnValue("https://example.com"), + exposeFunction: vi.fn(), + removeExposedFunction: vi.fn(), + waitForSelector: vi.fn(), } as unknown as Page; const fieldStats = [ { - fieldName: "title", + fieldName: "title" as const, successCount: 0, totalAttempts: 0, isOptional: false, missingItems: [], }, { - fieldName: "url", + fieldName: "url" as const, successCount: 0, totalAttempts: 0, isOptional: false, @@ -232,16 +270,23 @@ describe("ListingPageExtractor", () => { it("should handle items with no extractable data", async () => { const extractor = createListingPageExtractor(); + + // Mock page with required methods const mockPage = { evaluate: vi.fn().mockResolvedValue([ { - item: {}, + values: {}, fieldResults: {}, + isExcluded: false, hasRequiredFields: false, - missingRequiredFields: ["title", "url"], + missingRequiredFields: [], extractionErrors: [], }, ]), + url: vi.fn().mockReturnValue("https://example.com"), + exposeFunction: vi.fn(), + removeExposedFunction: vi.fn(), + waitForSelector: vi.fn(), } as unknown as Page; const fieldStats: FieldExtractionStats[] = []; @@ -261,44 +306,56 @@ describe("ListingPageExtractor", () => { it("should handle mixed success and failure cases", async () => { const extractor = 
createListingPageExtractor(); + + // Mock page with required methods const mockPage = { evaluate: vi.fn().mockResolvedValue([ { - item: { title: "Good Article", url: "/article/1" }, + values: { + title: "Good Article", + url: "/article/1", + author: null, + date: null, + }, fieldResults: { title: { success: true, value: "Good Article" }, url: { success: true, value: "/article/1" }, author: { success: false, value: null }, }, + isExcluded: false, hasRequiredFields: true, missingRequiredFields: [], extractionErrors: [], }, { - item: { title: "Bad Article" }, + values: { title: "Bad Article", url: null, author: null, date: null }, fieldResults: { title: { success: true, value: "Bad Article" }, url: { success: false, value: null, error: "Network error" }, author: { success: false, value: null }, }, + isExcluded: false, hasRequiredFields: false, missingRequiredFields: ["url"], extractionErrors: ["Field 'url' extraction failed: Network error"], }, ]), url: vi.fn().mockReturnValue("https://example.com"), + exposeFunction: vi.fn(), + removeExposedFunction: vi.fn(), + waitForSelector: vi.fn(), } as unknown as Page; - const fieldStats = [ + const fieldStats: FieldExtractionStats[] = [ { - fieldName: "title", + fieldName: "title" as const, successCount: 0, totalAttempts: 0, isOptional: false, missingItems: [], }, { - fieldName: "url", + fieldName: "url" as const, successCount: 0, totalAttempts: 0, isOptional: false, @@ -321,196 +378,76 @@ describe("ListingPageExtractor", () => { ); }); - it("should exclude items with URLs matching exclusion patterns", async () => { + it("should handle items that are excluded by shouldExcludeItem function", async () => { const extractor = createListingPageExtractor(); - const mockConfigWithExcludes: SourceConfig = { + + // Mock config with shouldExcludeItem function + const mockConfigWithExclude: SourceConfig = { ...mockConfig, - content_url_excludes: ["/excluded/", "/category/"], + listing: { + ...mockConfig.listing, + shouldExcludeItem: vi.fn().mockImplementation((html, values) => { + return values?.title === "Excluded Article"; + }), + }, }; + // Mock page with required methods const mockPage = { url: () => "https://test.com", evaluate: vi.fn().mockResolvedValue([ { - item: { title: "Normal Article", url: "/article/1" }, + values: { + title: "Normal Article", + url: "/article/1", + author: null, + date: null, + }, fieldResults: { title: { success: true, value: "Normal Article" }, url: { success: true, value: "/article/1" }, }, - hasExcludedUrl: false, + isExcluded: false, hasRequiredFields: true, missingRequiredFields: [], extractionErrors: [], }, { - item: { title: "Excluded Article", url: "/excluded/article/2" }, - fieldResults: { - title: { success: true, value: "Excluded Article" }, - url: { success: true, value: "/excluded/article/2" }, + values: { + title: "Excluded Article", + url: "/article/2", + author: null, + date: null, }, - hasExcludedUrl: true, - hasRequiredFields: true, - missingRequiredFields: [], - extractionErrors: [], - }, - { - item: { title: "Another Normal Article", url: "/article/3" }, fieldResults: { - title: { success: true, value: "Another Normal Article" }, - url: { success: true, value: "/article/3" }, + title: { success: true, value: "Excluded Article" }, + url: { success: true, value: "/article/2" }, }, - hasExcludedUrl: false, + isExcluded: true, hasRequiredFields: true, missingRequiredFields: [], extractionErrors: [], }, ]), + exposeFunction: vi.fn(), + removeExposedFunction: vi.fn(), + waitForSelector: vi.fn(), } as unknown as Page; 
const fieldStats: FieldExtractionStats[] = []; const result = await extractor.extractItemsFromPage( mockPage, - mockConfigWithExcludes, + mockConfigWithExclude, fieldStats, 0, ); // Should only include non-excluded items - expect(result.items).toHaveLength(2); + expect(result.items).toHaveLength(1); expect(result.items[0].title).toBe("Normal Article"); - expect(result.items[1].title).toBe("Another Normal Article"); // Should track excluded URLs - expect(result.excludedUrls).toContain("/excluded/article/2"); - expect(result.filteredCount).toBe(1); - }); - - it("should handle items with no exclusion patterns", async () => { - const extractor = createListingPageExtractor(); - const mockConfigWithoutExcludes: SourceConfig = { - ...mockConfig, - content_url_excludes: undefined, - }; - - const mockPage = { - url: () => "https://test.com", - evaluate: vi.fn().mockResolvedValue([ - { - item: { title: "Article 1", url: "/article/1" }, - fieldResults: { - title: { success: true, value: "Article 1" }, - url: { success: true, value: "/article/1" }, - }, - hasExcludedUrl: false, - hasRequiredFields: true, - missingRequiredFields: [], - extractionErrors: [], - }, - { - item: { title: "Article 2", url: "/article/2" }, - fieldResults: { - title: { success: true, value: "Article 2" }, - url: { success: true, value: "/article/2" }, - }, - hasExcludedUrl: false, - hasRequiredFields: true, - missingRequiredFields: [], - extractionErrors: [], - }, - ]), - } as unknown as Page; - - const fieldStats: FieldExtractionStats[] = []; - const result = await extractor.extractItemsFromPage( - mockPage, - mockConfigWithoutExcludes, - fieldStats, - 0, - ); - - // Should include all items when no exclusion patterns - expect(result.items).toHaveLength(2); - expect(result.excludedUrls).toHaveLength(0); - expect(result.filteredCount).toBe(0); - }); - - it("should properly filter items when exclusion patterns is empty array", async () => { - const extractor = createListingPageExtractor(); - const mockConfigWithEmptyExcludes: SourceConfig = { - ...mockConfig, - content_url_excludes: [], - }; - - const mockPage = { - url: () => "https://test.com", - evaluate: vi.fn().mockResolvedValue([ - { - item: { title: "Article 1", url: "/article/1" }, - fieldResults: { - title: { success: true, value: "Article 1" }, - url: { success: true, value: "/article/1" }, - }, - hasExcludedUrl: false, - hasRequiredFields: true, - missingRequiredFields: [], - extractionErrors: [], - }, - ]), - } as unknown as Page; - - const fieldStats: FieldExtractionStats[] = []; - const result = await extractor.extractItemsFromPage( - mockPage, - mockConfigWithEmptyExcludes, - fieldStats, - 0, - ); - - // Should include all items when exclusion patterns is empty - expect(result.items).toHaveLength(1); - expect(result.excludedUrls).toHaveLength(0); - expect(result.filteredCount).toBe(0); - }); - - it("should add filtered reason for excluded items", async () => { - const extractor = createListingPageExtractor(); - const mockConfigWithExcludes: SourceConfig = { - ...mockConfig, - content_url_excludes: ["/excluded/"], - }; - - const mockPage = { - url: () => "https://test.com", - evaluate: vi.fn().mockResolvedValue([ - { - item: { title: "Excluded Article", url: "/excluded/article/1" }, - fieldResults: { - title: { success: true, value: "Excluded Article" }, - url: { success: true, value: "/excluded/article/1" }, - }, - hasExcludedUrl: true, - hasRequiredFields: true, - missingRequiredFields: [], - extractionErrors: [], - }, - ]), - } as unknown as Page; - - const 
fieldStats: FieldExtractionStats[] = []; - const result = await extractor.extractItemsFromPage( - mockPage, - mockConfigWithExcludes, - fieldStats, - 0, - ); - - // Should have no valid items but should track the exclusion - expect(result.items).toHaveLength(0); - expect(result.excludedUrls).toContain("/excluded/article/1"); + expect(result.excludedUrls).toContain("/article/2"); expect(result.filteredCount).toBe(1); - - // Should not add a specific filtered reason for excluded items - // (they are tracked separately in excludedUrls) - expect(result.filteredReasons).toHaveLength(0); }); }); diff --git a/src/tests/crawlers/extractors/ListingPageExtractor.whitespace.test.ts b/src/tests/crawlers/extractors/ListingPageExtractor.whitespace.test.ts index 86f0277..83ce7b5 100644 --- a/src/tests/crawlers/extractors/ListingPageExtractor.whitespace.test.ts +++ b/src/tests/crawlers/extractors/ListingPageExtractor.whitespace.test.ts @@ -11,12 +11,9 @@ describe("ListingPageExtractor - Whitespace handling", () => { type: CRAWLER_TYPES.LISTING, listing: { url: "https://example.com", - items: { - container_selector: ".article", - fields: { - title: { selector: ".title", attribute: "text" }, - excerpt: { selector: ".excerpt", attribute: "text", optional: true }, - }, + container_selector: ".article", + fields: { + title: { selector: ".title", attribute: "text" }, }, }, content: { @@ -32,37 +29,32 @@ describe("ListingPageExtractor - Whitespace handling", () => { const mockPage = { evaluate: vi.fn().mockResolvedValue([ { - item: { + values: { title: "Test Article With Spaces", // Already normalized by browser context - excerpt: "This is an excerpt", // Already normalized by browser context }, fieldResults: { title: { success: true, value: "Test Article With Spaces" }, - excerpt: { success: true, value: "This is an excerpt" }, }, + isExcluded: false, hasRequiredFields: true, missingRequiredFields: [], extractionErrors: [], }, ]), url: vi.fn().mockReturnValue("https://example.com"), + exposeFunction: vi.fn(), + removeExposedFunction: vi.fn(), + waitForSelector: vi.fn(), } as unknown as Page; - const fieldStats = [ + const fieldStats: FieldExtractionStats[] = [ { - fieldName: "title", + fieldName: "title" as const, successCount: 0, totalAttempts: 0, isOptional: false, missingItems: [], }, - { - fieldName: "excerpt", - successCount: 0, - totalAttempts: 0, - isOptional: true, - missingItems: [], - }, ]; const result = await extractor.extractItemsFromPage( @@ -75,7 +67,6 @@ describe("ListingPageExtractor - Whitespace handling", () => { // Should have normalized the whitespace in the returned items expect(result.items).toHaveLength(1); expect(result.items[0].title).toBe("Test Article With Spaces"); - expect(result.items[0].content).toBe("This is an excerpt"); }); it("should handle empty text after whitespace normalization", async () => { @@ -83,37 +74,32 @@ describe("ListingPageExtractor - Whitespace handling", () => { const mockPage = { evaluate: vi.fn().mockResolvedValue([ { - item: { - title: "Valid Title", - excerpt: null, // Browser context would return null for empty/whitespace-only text + values: { + title: "", // Empty after normalization }, fieldResults: { - title: { success: true, value: "Valid Title" }, - excerpt: { success: false, value: null }, // Marked as unsuccessful + title: { success: false, value: null }, }, - hasRequiredFields: true, - missingRequiredFields: [], + isExcluded: false, + hasRequiredFields: false, + missingRequiredFields: ["title"], extractionErrors: [], }, ]), url: 
vi.fn().mockReturnValue("https://example.com"), + exposeFunction: vi.fn(), + removeExposedFunction: vi.fn(), + waitForSelector: vi.fn(), } as unknown as Page; - const fieldStats = [ + const fieldStats: FieldExtractionStats[] = [ { - fieldName: "title", + fieldName: "title" as const, successCount: 0, totalAttempts: 0, isOptional: false, missingItems: [], }, - { - fieldName: "excerpt", - successCount: 0, - totalAttempts: 0, - isOptional: true, - missingItems: [], - }, ]; const result = await extractor.extractItemsFromPage( @@ -124,9 +110,8 @@ describe("ListingPageExtractor - Whitespace handling", () => { ); // Should have handled empty text correctly - expect(result.items).toHaveLength(1); - expect(result.items[0].title).toBe("Valid Title"); - expect(result.items[0].content).toBe(""); // Empty string for excerpt + expect(result.items).toHaveLength(0); + expect(result.filteredCount).toBe(1); }); it("should normalize whitespace with exclusions", async () => { diff --git a/src/tests/crawlers/handlers/PaginationHandler.navigation.test.ts b/src/tests/crawlers/handlers/PaginationHandler.navigation.test.ts index 998ded8..1b107f3 100644 --- a/src/tests/crawlers/handlers/PaginationHandler.navigation.test.ts +++ b/src/tests/crawlers/handlers/PaginationHandler.navigation.test.ts @@ -20,11 +20,9 @@ describe("PaginationHandler - Navigation", () => { type: CRAWLER_TYPES.LISTING, listing: { url: "https://example.com", - items: { - container_selector: ".article", - fields: { - title: { selector: ".title", attribute: "text" }, - }, + container_selector: ".article", + fields: { + title: { selector: ".title", attribute: "text" }, }, pagination: { next_button_selector: ".next-page", diff --git a/src/tests/integration/an-integration.test.ts b/src/tests/integration/an-integration.test.ts new file mode 100644 index 0000000..dab7388 --- /dev/null +++ b/src/tests/integration/an-integration.test.ts @@ -0,0 +1,92 @@ +import { afterAll, beforeAll, describe, expect, it, vi } from "vitest"; +import { anSource as config } from "@/config/sources/an.js"; +import { createContentPageExtractor } from "@/crawlers/extractors/ContentPageExtractor"; +import { createListingPageExtractor } from "@/crawlers/extractors/ListingPageExtractor"; +import type { BrowserHandler } from "@/crawlers/handlers/BrowserHandler"; +import { createBrowserHandler } from "@/crawlers/handlers/BrowserHandler"; +import { navigateToNextPage } from "@/crawlers/handlers/PaginationHandler"; +import fixture4 from "@/tests/__fixtures__/an/biden-digital-rights"; +import fixture2 from "@/tests/__fixtures__/an/kenya-sim-card-biometrics"; +import fixture1 from "@/tests/__fixtures__/an/russias-record-war-on-connectivity"; +import fixture3 from "@/tests/__fixtures__/an/vodafone-challenged-release-transparency-report"; + +const ifDescribe = process.env.INT_TEST === "true" ? 
describe : describe.skip; + +ifDescribe("Access Now integration tests", () => { + let browser: BrowserHandler; + vi.setConfig({ testTimeout: 60000 }); + + beforeAll(async () => { + browser = await createBrowserHandler(config); + }); + + afterAll(async () => { + await browser.close(); + }); + + it("should crawl AN listing page", async () => { + const page = await browser.setupNewPage(config.listing.url); + const extractor = createListingPageExtractor(); + const result = await extractor.extractItemsFromPage(page, config, [], 0); + expect(result.items.length).toBeGreaterThan(0); + expect(result.items.every((item) => !!item.title)).toBeTruthy(); + expect(result.items.every((item) => !!item.url)).toBeTruthy(); + expect(result.items.every((item) => !!item.publishedDate)).toBeTruthy(); + }); + + it("should crawl to next AN listing page", async () => { + const page = await browser.setupNewPage(config.listing.url); + expect(await navigateToNextPage(page, config)).toBeTruthy(); + }); + + it("should crawl multiple AN content pages", async () => { + const testCases = [ + { + url: "https://www.accessnow.org/russias-record-war-on-connectivity/", + expectedTitle: "Russia’s record war on connectivity", + expectedAuthor: "Anastasiya", + expectedContent: fixture1, + }, + { + url: "https://www.accessnow.org/kenya-sim-card-biometrics/", + expectedTitle: + "Why Kenyans should say no to biometrics for SIM card registry", + expectedAuthor: "Bridget Jaimee Kokonya", + expectedContent: fixture2, + }, + { + url: "https://www.accessnow.org/vodafone-challenged-release-transparency-report/", + expectedTitle: "Vodafone Challenged to Release Transparency Report", + expectedAuthor: "Peter Micek", + expectedContent: fixture3, + }, + { + url: "https://www.accessnow.org/biden-digital-rights/", + expectedTitle: + "Six months in, Biden must speed progress on digital rights", + expectedAuthor: "Jennifer Brody Eric Null Peter Micek", + expectedContent: fixture4, + }, + ]; + + await Promise.all( + testCases.map(async (testCase) => { + const page = await browser.setupNewPage(testCase.url); + const extractor = createContentPageExtractor(browser); + const result = await extractor.extractFromContentPage( + browser, + page, + testCase.url, + config, + ); + + expect(result.contentData.title).toEqual(testCase.expectedTitle); + expect(result.contentData.content).toEqual(testCase.expectedContent); + expect(result.contentData.author).toEqual(testCase.expectedAuthor); + expect(result.errors.length).toBe(0); + + await page.close(); + }), + ); + }); +}); diff --git a/src/tests/ui/formatter.test.ts b/src/tests/ui/formatter.test.ts index a49a29f..aa7031f 100644 --- a/src/tests/ui/formatter.test.ts +++ b/src/tests/ui/formatter.test.ts @@ -82,7 +82,6 @@ describe("Data Formatter", () => { const result2 = formatDataForViewing(data, summary); const secondItemSection = result2.split("--- Item 2 of 2 ---")[1]; expect(secondItemSection).not.toContain("Author:"); - expect(secondItemSection).not.toContain("Excerpt:"); }); it("should format metadata as JSON and handle edge cases", () => { From da796178ef8d263b5cc1e76025989533c070ee18 Mon Sep 17 00:00:00 2001 From: Xavier Saliniere Date: Mon, 15 Sep 2025 16:14:28 -0400 Subject: [PATCH 2/2] test: fix integration tests --- src/tests/integration/eff-integration.test.ts | 1 - src/tests/integration/fpf-integration.test.ts | 1 - src/tests/integration/p2p-integration.test.ts | 1 - 3 files changed, 3 deletions(-) diff --git a/src/tests/integration/eff-integration.test.ts 
b/src/tests/integration/eff-integration.test.ts index c0dadf9..3bd2801 100644 --- a/src/tests/integration/eff-integration.test.ts +++ b/src/tests/integration/eff-integration.test.ts @@ -33,7 +33,6 @@ ifDescribe("Electronics Foundation integration tests", () => { expect(result.items.every((item) => !!item.title)).toBeTruthy(); expect(result.items.every((item) => !!item.url)).toBeTruthy(); expect(result.items.every((item) => !!item.publishedDate)).toBeTruthy(); - expect(result.items.every((item) => !!item.content)).toBeTruthy(); }); it("should crawl to next EFF listing page", async () => { diff --git a/src/tests/integration/fpf-integration.test.ts b/src/tests/integration/fpf-integration.test.ts index fff3830..686a61f 100644 --- a/src/tests/integration/fpf-integration.test.ts +++ b/src/tests/integration/fpf-integration.test.ts @@ -32,7 +32,6 @@ ifDescribe("Freedom Press integration tests", () => { expect(result.items.every((item) => !!item.title)).toBeTruthy(); expect(result.items.every((item) => !!item.url)).toBeTruthy(); expect(result.items.every((item) => !!item.publishedDate)).toBeTruthy(); - expect(result.items.every((item) => !!item.content)).toBeTruthy(); }); it("should crawl to next FPF listing page", async () => { diff --git a/src/tests/integration/p2p-integration.test.ts b/src/tests/integration/p2p-integration.test.ts index d4bdca8..766755c 100644 --- a/src/tests/integration/p2p-integration.test.ts +++ b/src/tests/integration/p2p-integration.test.ts @@ -32,7 +32,6 @@ ifDescribe("P2P Foundation integration tests", () => { expect(result.items.every((item) => !!item.title)).toBeTruthy(); expect(result.items.every((item) => !!item.url)).toBeTruthy(); expect(result.items.every((item) => !!item.publishedDate)).toBeTruthy(); - expect(result.items.every((item) => !!item.content)).toBeTruthy(); }); it("should crawl to next P2P listing page", async () => {
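The `shouldExcludeItem` hook mocked in the updated `ListingPageExtractor` tests above receives the item's container HTML plus the extracted field values and returns `true` when the item should be dropped from the listing results. A minimal sketch of such a predicate, under those assumptions, is below; the marker class and URL paths are hypothetical illustrations, not values taken from any real source config.

```typescript
// Sketch of a shouldExcludeItem predicate with the same shape the tests mock:
// (containerHtml, values) => boolean. The "external-link" marker class and the
// excluded paths are hypothetical examples only.
const shouldExcludeItem = (
  containerHtml: string,
  values?: { url?: string | null },
): boolean => {
  const excludedPaths = ["/events/", "/jobs/"]; // hypothetical exclusion paths
  const looksExternal = containerHtml.includes("external-link"); // hypothetical marker class
  const matchesExcludedPath = excludedPaths.some((path) =>
    values?.url?.includes(path),
  );
  return looksExternal || matchesExcludedPath;
};
```

The stubbed `exposeFunction` and `removeExposedFunction` methods on the mock pages in those same tests suggest the extractor exposes this predicate to the browser context while evaluating listing items, which is why the mocks now include them.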