diff --git a/.vscode/settings.json b/.vscode/settings.json
index 5ad5378..b0f715c 100644
--- a/.vscode/settings.json
+++ b/.vscode/settings.json
@@ -12,6 +12,7 @@
         "maple_proc",
         "maple_structures",
         "~/rcs-utils",
+        "./newsscrapy"
     ],
     "python.testing.unittestArgs": [
         "-v",
diff --git a/install.sh b/install.sh
index e98f4dc..46595d3 100755
--- a/install.sh
+++ b/install.sh
@@ -26,6 +26,10 @@ install_packages(){
 
     pip install scrapy-fake-useragent
 
+    pip install scrapy-playwright
+
+    playwright install chromium
+
     pip install --upgrade pip setuptools
     pip install python-socketio python-socketio[client]
diff --git a/maple_chat/maple_chatgpt/chatgpt_server.py b/maple_chat/maple_chatgpt/chatgpt_server.py
index 74458b0..ffce6a2 100644
--- a/maple_chat/maple_chatgpt/chatgpt_server.py
+++ b/maple_chat/maple_chatgpt/chatgpt_server.py
@@ -180,15 +180,19 @@ async def _fetch_pending_summaries(self):
         while True:
             self.logger.info('Fetching articles without chat_summary.')
             try:
+                fetched_articles = []
                 for articles in self.maple_api.article_iterator(limit=1000, page=0):
                     for article in articles:
-                        if not hasattr(article, 'chat_summary'):
-                            self.maple_add_job(
-                                sid=None,
-                                api_key = self._chatgpt_api_key,
-                                job_type = JobType.summary,
-                                job_details = article.to_dict()
-                            )
+                        fetched_articles.append(article)
+                self.logger.info('Fetched %d articles.', len(fetched_articles))
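+                # Queue the oldest articles first. This assumes
+                # article_iterator yields newest articles first, so the
+                # reversed pass submits summary jobs in chronological order.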
+                for article in reversed(fetched_articles):
+                    if not hasattr(article, 'chat_summary'):
+                        self.maple_add_job(
+                            sid=None,
+                            api_key=self._chatgpt_api_key,
+                            job_type=JobType.summary,
+                            job_details=article.to_dict()
+                        )
             except Exception as exc:
                 self.logger.error('Failed fetch_pending_summaries. %s', exc)
             sec = rcs.utils.time_to_midnight()
diff --git a/maple_proc/maple_processing/processing.py b/maple_proc/maple_processing/processing.py
index 25e18eb..5f72b70 100644
--- a/maple_proc/maple_processing/processing.py
+++ b/maple_proc/maple_processing/processing.py
@@ -314,7 +314,7 @@ def _fetch_training_data(self):
                 flag_message = True
             if article_hours > self._max_hours:
                 raise ValueError(
-                    "Number of hours used for trained reached max_hours without having enough articles for training")
+                    f"Number of hours used for training reached max_hours without having enough articles for training. Only {len(self._training_data)} articles retrieved (required: {self._article_train_min_size}).")
         self.logger.info('Retrieved %d articles', len(self._training_data))
 
     def _extract_chat_summaries(self, data: list[Article]):
diff --git a/maple_structures/maple_structures/maple.py b/maple_structures/maple_structures/maple.py
index 595e7f9..c024d4b 100644
--- a/maple_structures/maple_structures/maple.py
+++ b/maple_structures/maple_structures/maple.py
@@ -167,7 +167,10 @@ def about(self, value):
         setattr(self, "_about", value)
 
     def to_dict(self):
-        return self._to_dict_endpoint()
+        author = self._to_dict_endpoint()
+        if author.get('email') == '':
+            author.pop('email')
+        return author
 
     @staticmethod
     def from_json(data):
@@ -247,7 +250,7 @@ def to_dict(self):
             dict(name="title", type=str, default=""),
             dict(name="summary", type=str, default=""),
             dict(name="content", type=str, default=""),
-            dict(name="author", type=list, default={},
+            dict(name="author", type=list, default=[],
                  secondary_type=Author, validator=Author.validate),
             dict(name="video_url", type=list, default=[],
                  secondary_type=str, validator=validators.url),
@@ -305,8 +308,9 @@ def add_author(self, author: Author):
             )
         if getattr(self, '_author', []) is None:
             setattr(self, '_author', [])
-        setattr(self, '_author', getattr(self, '_author', []).append(author))
-        # self._author.append(author)
+        authors = getattr(self, '_author', [])
+        authors.append(author)
+        setattr(self, '_author', authors)
 
     def to_dict(self, *, suppress_null=True):
         '''converts Article to dictionary'''
diff --git a/newsscrapy/newsscrapy/spiders/CTVNewsV2.py b/newsscrapy/newsscrapy/spiders/CTVNewsV2.py
new file mode 100644
index 0000000..f04dc85
--- /dev/null
+++ b/newsscrapy/newsscrapy/spiders/CTVNewsV2.py
@@ -0,0 +1,204 @@
+import os
+import glob
+from typing import Union
+import re
+import json
+import logging
+
+from scrapy.spiders import Spider
+from scrapy.http import HtmlResponse
+from scrapy.crawler import CrawlerProcess
+from scrapy.utils.project import get_project_settings
+
+from playwright.async_api import async_playwright
+
+from maple_structures import Article, Author
+
+logger = logging.getLogger("CTVNewsSpider")
+
+
+class CTVNewsParser:
+    @staticmethod
+    def parse_article(response: HtmlResponse) -> Union[Article, None]:
+        article = None
+        try:
+            article = CTVNewsParser.parse_from_json(response)
+        except (json.JSONDecodeError, ValueError) as e:
+            logger.warning('Error parsing article from JSON: %s', e)
+
+        if article is None:
+            try:
+                article = CTVNewsParser.parse_from_html(response)
+            except NotImplementedError as e:
+                logger.warning('HTML parsing not implemented: %s', e)
+            except Exception as e:
+                logger.warning('Error parsing article from HTML: %s', e)
+
+        return article
+
+    @staticmethod
+    def parse_from_json(response: HtmlResponse) -> Union[Article, None]:
+        js = response.xpath('//script[@id="fusion-metadata"]//text()').extract_first()
+
+        if js:
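+            # CTV News pages appear to embed the article payload as JSON
+            # assigned to Fusion.globalContent inside the "fusion-metadata"
+            # script tag; pull that object out with a regex before parsing.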
+            match = re.search(r'Fusion\.globalContent\s*=\s*({.*?});', js, re.DOTALL)
+            if match:
+                json_str = match.group(1)
+                data = json.loads(json_str)
+                article = Article()
+                article.url = response.url
+
+                # Title
+                if 'headlines' in data:
+                    if 'basic' in data['headlines']:
+                        article.title = data['headlines']['basic']
+
+                # Content
+                if 'content_elements' in data:
+                    if isinstance(data['content_elements'], list):
+                        content = article.title
+                        for element in data['content_elements']:
+                            if 'type' in element and element['type'] == 'text':
+                                if 'content' in element:
+                                    content += '\n' + element['content']
+                            elif 'type' in element and element['type'] == 'header':
+                                if 'content' in element:
+                                    content += '\n' + element['content'] + '\n'
+                            else:
+                                if 'content' in element:
+                                    content += ' ' + element['content']
+                        article.content = content
+
+                # Date published
+                if 'first_publish_date' in data:
+                    article.date_published = data['first_publish_date']
+                elif 'publish_date' in data:
+                    article.date_published = data['publish_date']
+                if 'last_updated_date' in data:
+                    article.date_modified = data['last_updated_date']
+
+                if 'language' in data:
+                    article.language = data['language']
+
+                # Authors
+                if 'credits' in data:
+                    if 'by' in data['credits']:
+                        for author in data['credits']['by']:
+                            new_author = Author()
+                            if 'additional_properties' in author:
+                                if 'original' in author['additional_properties']:
+                                    original = author['additional_properties']['original']
+                                    name = ''
+                                    if 'firstName' in original:
+                                        name += original['firstName']
+                                    if 'lastName' in original:
+                                        name += ' ' + original['lastName']
+                                    new_author.name = name
+
+                                    if 'email' in original:
+                                        new_author.email = original['email']
+                                    else:
+                                        new_author.email = None
+                                    if 'bio_page' in original:
+                                        new_author.url = response.urljoin(original['bio_page'])
+
+                            # If the preferred format is not found, try to
+                            # parse the author from other fields.
+                            elif 'type' in author and author['type'] == 'author':
+                                if 'name' in author:
+                                    new_author.name = author['name']
+                                if 'url' in author:
+                                    new_author.url = response.urljoin(author['url'])
+                                if 'social_links' in author:
+                                    if 'url' in author['social_links']:
+                                        if 'site' in author['social_links'] and author['social_links']['site'] == 'email':
+                                            new_author.email = author['social_links']['url']
+                            if new_author.name != '':
+                                article.add_author(new_author)
+                return article
+        raise ValueError('Article not found in JSON')
+
+    @staticmethod
+    def parse_from_html(response: HtmlResponse) -> Union[Article, None]:
+        raise NotImplementedError()
+
+
+class CTVNewsSpider(Spider):
+    name = 'CTVNewsSpider'
+    sample_file_location = None
+
+    def __init__(self, on_article_content: callable = None, **kwargs):
+        super().__init__(self.name, **kwargs)
+        self.start_urls = ['https://www.ctvnews.ca']
+        self.on_article_content = on_article_content
+        self.visited = []
+        self.count = 0
+        if self.sample_file_location is not None:
+            os.makedirs(self.sample_file_location, exist_ok=True)
+
+    async def parse(self, response):
+        if response.url in self.visited:
+            return
+        self.visited.append(response.url)
+        for href in response.css('a::attr(href)').getall():
+            if self.start_urls[0] in href or href.startswith('/'):
+                if '/article/' in href:
+                    yield await self.parse_article(response.urljoin(href))
+                else:
+                    yield response.follow(href, self.parse)
+            else:
+                self.logger.debug('Skipping external link: %s', href)
+
+    async def parse_article(self, url):
+        article = None
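+        # Render the article in headless Chromium via Playwright so that
+        # markup injected client-side is present before handing the page
+        # to the parser.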
+        async with async_playwright() as p:
+            browser = await p.chromium.launch(headless=True)
+            page = await browser.new_page()
+
+            await page.goto(url)  # optionally: wait_until='networkidle'
+
+            content = await page.content()
+            htmlresponse = HtmlResponse(url=page.url, body=content, encoding='utf-8')
+
+            article = CTVNewsParser.parse_article(htmlresponse)
+
+            if self.sample_file_location is not None:
+                # Store screenshot
+                await page.screenshot(
+                    path=os.path.join(self.sample_file_location, f'{self.count}-screen.png'),
+                    full_page=True)
+
+                # Store article JSON, if one was parsed
+                if article is not None:
+                    with open(
+                            os.path.join(self.sample_file_location, f'{self.count}-article.json'),
+                            'w',
+                            encoding='utf-8') as file:
+                        json.dump(article.to_dict(), file, indent=4)
+
+                # Store HTML content
+                html = htmlresponse.body.decode('utf-8')
+                with open(
+                        os.path.join(self.sample_file_location, f'{self.count}-contenthtml.html'),
+                        'w',
+                        encoding='utf-8') as file:
+                    file.write(html)
+
+            await browser.close()
+        self.count += 1
+
+        return article.to_dict() if article is not None else None
+
+
+if __name__ == "__main__":
+
+    SAMPLE_FOLDER = 'sample3'
+
+    # Clean up the sample folder.
+    files = glob.glob(os.path.join(SAMPLE_FOLDER, '*'))
+    for f in files:
+        os.remove(f)
+
+    settings = get_project_settings()
+    settings['PLAYWRIGHT_BROWSER_TYPE'] = 'chromium'
+    settings['PLAYWRIGHT_LAUNCH_OPTIONS'] = {'headless': True}
+    settings['ROBOTSTXT_OBEY'] = True
+
+    process = CrawlerProcess(settings=settings)
+    CTVNewsSpider.sample_file_location = SAMPLE_FOLDER
+    process.crawl(CTVNewsSpider)
+    process.start()
\ No newline at end of file
diff --git a/runtime_scripts/data_fetcher.py b/runtime_scripts/data_fetcher.py
index bba29bd..a94e245 100644
--- a/runtime_scripts/data_fetcher.py
+++ b/runtime_scripts/data_fetcher.py
@@ -16,14 +16,13 @@
 sys.path.append(os.path.join(os.path.abspath(""), "newsscrapy"))
 sys.path.append(os.path.join(os.path.abspath(""), "../newsscrapy"))
 
-from newsscrapy.spiders import scrapyCBC, scrapyCTVNews
+from newsscrapy.spiders import scrapyCBC, scrapyCTVNews, CTVNewsV2
 
 print(sys.path)
 
-
 scrapy.utils.reactor.install_reactor(
     "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
 )
@@ -64,7 +63,8 @@
 spiders_ = [
     scrapyCBC.CBCHeadlinesSpider,
-    scrapyCTVNews.CTVNewsSpider,
+    # scrapyCTVNews.CTVNewsSpider,
+    CTVNewsV2.CTVNewsSpider,
 ]