Merged
1 change: 1 addition & 0 deletions .vscode/settings.json
@@ -12,6 +12,7 @@
"maple_proc",
"maple_structures",
"~/rcs-utils",
"./newsscrapy"
],
"python.testing.unittestArgs": [
"-v",
4 changes: 4 additions & 0 deletions install.sh
@@ -26,6 +26,10 @@ install_packages(){

pip install scrapy-fake-useragent

pip install scrapy-playwright

playwright install chromium

pip install --upgrade pip setuptools

pip install python-socketio python-socketio[client]
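For reference, scrapy-playwright only takes effect once the Playwright download handlers and the asyncio reactor are wired into the Scrapy settings; a minimal sketch of what the project's settings.py would need (the handler paths and setting names come from scrapy-playwright's documented configuration, while applying them to newsscrapy is an assumption):

# Hypothetical excerpt of newsscrapy/settings.py enabling scrapy-playwright.
DOWNLOAD_HANDLERS = {
    "http": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
    "https": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
}
# scrapy-playwright needs the asyncio reactor, which data_fetcher.py below also installs explicitly.
TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
PLAYWRIGHT_BROWSER_TYPE = "chromium"
PLAYWRIGHT_LAUNCH_OPTIONS = {"headless": True}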
18 changes: 11 additions & 7 deletions maple_chat/maple_chatgpt/chatgpt_server.py
@@ -180,15 +180,19 @@ async def _fetch_pending_summaries(self):
while True:
self.logger.info('Fetching articles without chat_summary.')
try:
fetched_articles = []
for articles in self.maple_api.article_iterator(limit=1000, page=0):
for article in articles:
if not hasattr(article, 'chat_summary'):
self.maple_add_job(
sid=None,
api_key = self._chatgpt_api_key,
job_type = JobType.summary,
job_details = article.to_dict()
)
fetched_articles.append(article)
self.logger.info('Fetched %d articles.', len(fetched_articles))
for article in reversed(fetched_articles):
if not hasattr(article, 'chat_summary'):
self.maple_add_job(
sid=None,
api_key = self._chatgpt_api_key,
job_type = JobType.summary,
job_details = article.to_dict()
)
except Exception as exc:
self.logger.error('Failed fetch_pending_summaries. %s', exc)
sec = rcs.utils.time_to_midnight()
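The reworked loop above drains the paginated iterator first, logs the total, and only then queues summary jobs in reverse fetch order. A stripped-down sketch of that pattern, using stand-in names (pages and enqueue are not from the diff):

def enqueue_pending(pages, enqueue):
    # Drain every page before queuing so the fetched count can be logged up front.
    fetched = [article for page in pages for article in page]
    # reversed() puts the last-fetched articles at the head of the queue.
    for article in reversed(fetched):
        if not hasattr(article, 'chat_summary'):
            enqueue(article)
    return len(fetched)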
2 changes: 1 addition & 1 deletion maple_proc/maple_processing/processing.py
@@ -314,7 +314,7 @@ def _fetch_training_data(self):
flag_message = True
if article_hours > self._max_hours:
raise ValueError(
"Number of hours used for trained reached max_hours without having enough articles for training")
f"Number of hours used for trained reached max_hours without having enough articles for training. Only {len(self._training_data)} articles retrieved (required: {self._article_train_min_size}).")
self.logger.info('Retrieved %d articles', len(self._training_data))

def _extract_chat_summaries(self, data: list[Article]):
12 changes: 8 additions & 4 deletions maple_structures/maple_structures/maple.py
@@ -167,7 +167,10 @@ def about(self, value):
setattr(self, "_about", value)

def to_dict(self):
return self._to_dict_endpoint()
author = self._to_dict_endpoint()
if author['email'] == '':
author.pop('email')
return author

@staticmethod
def from_json(data):
@@ -247,7 +250,7 @@ def to_dict(self):
dict(name="title", type=str, default=""),
dict(name="summary", type=str, default=""),
dict(name="content", type=str, default=""),
dict(name="author", type=list, default={},
dict(name="author", type=list, default=[],
secondary_type=Author, validator=Author.validate),
dict(name="video_url", type=list, default=[],
secondary_type=str, validator=validators.url),
@@ -305,8 +308,9 @@ def add_author(self, author: Author):
)
if getattr(self, '_author', []) is None:
setattr(self, '_author', [])
setattr(self, '_author', getattr(self, '_author', []).append(author))
# self._author.append(author)
authors = getattr(self, '_author', [])
authors.append(author)
setattr(self, '_author', authors)

def to_dict(self, *, suppress_null=True):
'''converts Article to dictionary'''
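For context on the add_author fix above: list.append mutates the list in place and returns None, so the replaced one-liner stored None back into _author. A minimal illustration:

# append returns None, which is why chaining it through setattr cleared the attribute.
authors = []
assert authors.append('first') is None
authors.append('second')
assert authors == ['first', 'second']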
204 changes: 204 additions & 0 deletions newsscrapy/newsscrapy/spiders/CTVNewsV2.py
@@ -0,0 +1,204 @@


import os
import glob
from typing import Union
import re
import json
import logging

from scrapy.spiders import Spider
from scrapy.http import HtmlResponse
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from playwright.async_api import async_playwright

from maple_structures import Article, Author

logger = logging.getLogger("CTVNewsSpider")

class CTVNewsParser:
@staticmethod
def parse_article(response: HtmlResponse) -> Union[Article, None]:
article = None
try:
article = CTVNewsParser.parse_from_json(response)
except json.JSONDecodeError as e:
logger.warning('Error parsing article from JSON: %s', e)

if article is None:
try:
article = CTVNewsParser.parse_from_html(response)
except NotImplementedError as e:
logger.warning('HTML parsing not implemented: %s', e)
except Exception as e:
logger.warning('Error parsing article from HTML: %s', e)

return article


@staticmethod
def parse_from_json(response: HtmlResponse) -> Union[Article, None]:
js = response.xpath('//script[@id="fusion-metadata"]//text()').extract_first()

if js:
match = re.search(r'Fusion\.globalContent\s*=\s*({.*?});', js, re.DOTALL)
if match:
json_str = match.group(1)
data = json.loads(json_str)
article = Article()
article.url = response.url

# Title
if 'headlines' in data:
if 'basic' in data['headlines']:
article.title = data['headlines']['basic']
# Content
if 'content_elements' in data:
if isinstance(data['content_elements'], list):
content = article.title
for element in data['content_elements']:
if 'type' in element and element['type'] == 'text':
if 'content' in element:
content += '\n' + element['content']
elif 'type' in element and element['type'] == 'header':
if 'content' in element:
content += '\n' + element['content'] + '\n'
else:
if 'content' in element:
content += ' ' + element['content']
article.content = content
# Date Published
if 'first_publish_date' in data:
article.date_published = data['first_publish_date']
elif 'publish_date' in data:
article.date_published = data['publish_date']
if 'last_updated_date' in data:
article.date_modified = data['last_updated_date']

if 'language' in data:
article.language = data['language']

# Authors
if 'credits' in data:
if 'by' in data['credits']:
for author in data['credits']['by']:
new_author = Author()
if 'additional_properties' in author:
if 'original' in author['additional_properties']:
original = author['additional_properties']['original']
name = ''
if 'firstName' in original:
name += original['firstName']
if 'lastName' in original:
name += ' ' + original['lastName']
new_author.name = name

if 'email' in original:
new_author.email = original['email']
else:
new_author.email = None
if 'bio_page' in original:
new_author.url = response.urljoin(original['bio_page'])

# if best format is not found, try to parse from other fields
elif 'type' in author and author['type'] == 'author':
if 'name' in author:
new_author.name = author['name']
if 'url' in author:
new_author.url = response.urljoin(author['url'])
if 'social_links' in author:
if 'url' in author['social_links']:
if 'site' in author['social_links'] and author['social_links']['site'] == 'email':
new_author.email = author['social_links']['url']
if new_author.name != '':
article.add_author(new_author)
return article
raise ValueError('Article not found in JSON')

@staticmethod
def parse_from_html(response: HtmlResponse) -> Union[Article, None]:
raise NotImplementedError()


class CTVNewsSpider(Spider):
name = 'CTVNewsSpider'
sample_file_location = None

def __init__(self, on_article_content: callable = None, **kwargs):
super().__init__(self.name, **kwargs)
self.start_urls = ['https://www.ctvnews.ca']
self.on_article_content = on_article_content
self.visited = []
self.count=0
if self.sample_file_location is not None:
os.makedirs(self.sample_file_location, exist_ok=True)

async def parse(self, response):
if response.url in self.visited:
yield None
self.visited.append(response.url)
for href in response.css('a::attr(href)').getall():
if self.start_urls[0] in href or href.startswith('/'):
if '/article/' in href:
yield await self.parse_article(response.urljoin(href))
else:
yield response.follow(href, self.parse)
else:
print(f'Invalid external link: {href}')

async def parse_article(self, url):
article = None
async with async_playwright() as p:
browser = await p.chromium.launch(headless=True)
page = await browser.new_page()

await page.goto(url) # , wait_until='networkidle'

content = await page.content()
htmlresponse = HtmlResponse(url=page.url, body=content, encoding='utf-8')

article = CTVNewsParser.parse_article(htmlresponse)

if self.sample_file_location is not None:
# Store screenshot
await page.screenshot(
path=os.path.join(self.sample_file_location, f'{self.count}-screen.png'),
full_page=True)

# Store article JSON
json.dump(article.to_dict(), open(os.path.join(self.sample_file_location, f'{self.count}-article.json'), 'w', encoding='utf-8'), indent=4)

# Store HTML content
html = htmlresponse.body.decode('utf-8')
with open(
os.path.join(self.sample_file_location,f'{self.count}-contenthtml.html'),
'w',
encoding='utf-8') as file:
file.write(html)

await browser.close()
self.count += 1

return article.to_dict()


if __name__ == "__main__":

SAMPLE_FOLDER = 'sample3'

# cleanup sample folder.
files = glob.glob(os.path.join(SAMPLE_FOLDER, '*'))
for f in files:
os.remove(f)

settings = get_project_settings()
settings['PLAYWRIGHT_BROWSER_TYPE'] = 'chromium'
settings['PLAYWRIGHT_LAUNCH_OPTIONS'] = {'headless': True}
settings['ROBOTSTXT_OBEY'] = True

process = CrawlerProcess(settings=settings)
CTVNewsSpider.sample_file_location = SAMPLE_FOLDER
process.crawl(CTVNewsSpider)
process.start()
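The parse_from_json path above hinges on extracting the Fusion.globalContent object from the page's fusion-metadata script tag. A toy illustration of the same regex against a made-up payload (not real CTV markup):

import json
import re

js = 'Fusion.globalContent = {"headlines": {"basic": "Example headline"}};'
match = re.search(r'Fusion\.globalContent\s*=\s*({.*?});', js, re.DOTALL)
data = json.loads(match.group(1))
assert data['headlines']['basic'] == 'Example headline'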
6 changes: 3 additions & 3 deletions runtime_scripts/data_fetcher.py
@@ -16,14 +16,13 @@

sys.path.append(os.path.join(os.path.abspath(""), "newsscrapy"))
sys.path.append(os.path.join(os.path.abspath(""), "../newsscrapy"))
from newsscrapy.spiders import scrapyCBC, scrapyCTVNews
from newsscrapy.spiders import scrapyCBC, scrapyCTVNews, CTVNewsV2

print(sys.path)





scrapy.utils.reactor.install_reactor(
"twisted.internet.asyncioreactor.AsyncioSelectorReactor"
)
@@ -64,7 +63,8 @@

spiders_ = [
scrapyCBC.CBCHeadlinesSpider,
scrapyCTVNews.CTVNewsSpider,
# scrapyCTVNews.CTVNewsSpider,
CTVNewsV2.CTVNewsSpider,
]


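A minimal sketch of how a spider list like spiders_ is typically handed to Scrapy's CrawlerProcess; the scheduling code in data_fetcher.py is not part of this diff, so the surrounding loop is an assumption:

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(settings=get_project_settings())
for spider in spiders_:  # spiders_ as defined in the hunk above
    process.crawl(spider)
process.start()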