Commit 4c5e410

jeremymanning and claude committed
Add publications build system with Excel data source
- Create templates/publications.html with content markers
- Extract 104 publications from HTML to data/publications.xlsx
- Implement scripts/build_publications.py to generate HTML
- Add 13 comprehensive tests for build_publications.py
- Include extraction script for one-time data migration

The build system now generates publications.html from:
- Excel spreadsheet (data/publications.xlsx) with 6 sheets
- Template file (templates/publications.html) with markers

All 44 tests pass (31 utils + 13 publications).

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
1 parent 4d3dba0 commit 4c5e410
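
As a quick, illustrative check of the data source this commit introduces (not part of the commit itself), the expected workbook layout can be inspected with openpyxl; the sheet names, column headers, and path below are the ones used by the scripts in this diff:

# Sketch only: inspect data/publications.xlsx the way the build script expects it.
from pathlib import Path
import openpyxl

wb = openpyxl.load_workbook(Path('data/publications.xlsx'), read_only=True, data_only=True)
for name in wb.sheetnames:  # expected: papers, chapters, dissertations, talks, courses, posters
    # Header row should read: image, title, title_url, citation, links_html
    headers = next(wb[name].iter_rows(min_row=1, max_row=1, values_only=True))
    print(name, list(headers))
wb.close()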

File tree

6 files changed: +861 additions, -68 deletions


data/publications.xlsx

21.6 KB
Binary file not shown.

publications.html

Lines changed: 68 additions & 68 deletions
Large diffs are not rendered by default.

scripts/build_publications.py

Lines changed: 157 additions & 0 deletions
#!/usr/bin/env python3
"""Build publications.html from spreadsheet data.

Reads data from data/publications.xlsx and generates publications.html
using the template in templates/publications.html.
"""
from pathlib import Path
from typing import List, Dict, Any
import openpyxl

from utils import inject_content


def load_publications(xlsx_path: Path) -> Dict[str, List[Dict[str, Any]]]:
    """Load all publication data from Excel spreadsheet.

    Args:
        xlsx_path: Path to the publications.xlsx file

    Returns:
        Dictionary with keys for each section (papers, chapters, etc.)
        Each value is a list of publication dictionaries.
    """
    wb = openpyxl.load_workbook(xlsx_path, read_only=True, data_only=True)

    data = {}
    for sheet_name in wb.sheetnames:
        sheet = wb[sheet_name]

        # Get headers from first row
        headers = [cell.value for cell in sheet[1]]

        rows = []
        for row in sheet.iter_rows(min_row=2, values_only=True):
            # Skip empty rows
            if not any(cell is not None for cell in row):
                continue

            row_dict = {}
            for header, value in zip(headers, row):
                if value is None:
                    row_dict[header] = ''
                else:
                    row_dict[header] = value
            rows.append(row_dict)

        data[sheet_name] = rows

    wb.close()
    return data


def generate_publication_card(pub: Dict[str, Any]) -> str:
    """Generate HTML for a single publication card.

    Args:
        pub: Dictionary with publication data (image, title, title_url, citation, links_html)

    Returns:
        HTML string for the publication card
    """
    image = pub.get('image', '')
    title = pub.get('title', '')
    title_url = pub.get('title_url', '')
    citation = pub.get('citation', '')
    links_html = pub.get('links_html', '')

    # Build image path - prepend directory if not empty
    if image:
        image_src = f"images/publications/{image}"
    else:
        image_src = ""

    # Build title HTML
    if title_url:
        title_html = f'<a href="{title_url}" target="_blank">{title}</a>'
    else:
        title_html = title

    # Build links paragraph if we have links
    links_p = ''
    if links_html:
        links_p = f'\n          <p class="publication-links">{links_html}</p>'

    # Build citation paragraph if we have citation
    citation_p = ''
    if citation:
        citation_p = f'\n          <p>{citation}</p>'

    html = f'''      <div class="publication-card">
        <img src="{image_src}" alt="Publication thumbnail">
        <div>
          <h4>{title_html}</h4>{citation_p}{links_p}
        </div>
      </div>'''

    return html


def generate_section_content(publications: List[Dict[str, Any]]) -> str:
    """Generate HTML content for a publications section.

    Args:
        publications: List of publication dictionaries

    Returns:
        HTML string with all publication cards for the section
    """
    cards = [generate_publication_card(pub) for pub in publications]
    return '\n\n'.join(cards)


def build_publications(
    data_path: Path,
    template_path: Path,
    output_path: Path
) -> None:
    """Build publications.html from data and template.

    Args:
        data_path: Path to publications.xlsx
        template_path: Path to template HTML file
        output_path: Path for generated HTML file
    """
    # Load data
    data = load_publications(data_path)

    # Generate content for each section
    replacements = {
        'PAPERS_CONTENT': generate_section_content(data.get('papers', [])),
        'CHAPTERS_CONTENT': generate_section_content(data.get('chapters', [])),
        'DISSERTATIONS_CONTENT': generate_section_content(data.get('dissertations', [])),
        'TALKS_CONTENT': generate_section_content(data.get('talks', [])),
        'COURSES_CONTENT': generate_section_content(data.get('courses', [])),
        'POSTERS_CONTENT': generate_section_content(data.get('posters', [])),
    }

    # Inject into template
    inject_content(template_path, output_path, replacements)

    # Report
    total = sum(len(items) for items in data.values())
    print(f"Generated {output_path} with {total} publications")


def main():
    """Main entry point for CLI usage."""
    project_root = Path(__file__).parent.parent
    data_path = project_root / 'data' / 'publications.xlsx'
    template_path = project_root / 'templates' / 'publications.html'
    output_path = project_root / 'publications.html'

    build_publications(data_path, template_path, output_path)


if __name__ == '__main__':
    main()
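
To make the card generator above concrete, here is a small usage sketch (not from the commit); the keys match the spreadsheet columns and every value is a placeholder:

# Hypothetical row; keys mirror the spreadsheet columns read by load_publications.
pub = {
    'image': 'example-paper.png',
    'title': 'An Example Paper',
    'title_url': 'https://example.com/paper',
    'citation': 'Doe, J. (2024). An Example Paper. Journal of Examples.',
    'links_html': '<a href="#">PDF</a>',
}
print(generate_publication_card(pub))  # prints one publication-card <div>

Because main() resolves paths relative to the script's location rather than the current working directory, invoking the script directly (python scripts/build_publications.py from the repository root) regenerates publications.html and prints the total publication count.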

scripts/extract_publications.py

Lines changed: 163 additions & 0 deletions
#!/usr/bin/env python3
"""Extract publication data from existing HTML into Excel spreadsheet.

This is a one-time script to migrate existing HTML data to the spreadsheet format.
"""
import re
from pathlib import Path
from bs4 import BeautifulSoup
import openpyxl


def extract_publications(html_path: Path) -> dict:
    """Extract all publication data from HTML file.

    Returns dict with keys: papers, chapters, dissertations, talks, courses, posters
    """
    with open(html_path, 'r', encoding='utf-8') as f:
        soup = BeautifulSoup(f.read(), 'html.parser')

    sections = {
        'papers': 'papers',
        'chapters': 'chapters',
        'dissertations': 'dissertations',
        'talks': 'talks',
        'courses': 'course-mats',
        'posters': 'posters'
    }

    data = {}

    for key, section_id in sections.items():
        section = soup.find('section', id=section_id)
        if not section:
            print(f"Warning: Section '{section_id}' not found")
            data[key] = []
            continue

        cards = section.find_all('div', class_='publication-card')
        items = []

        for card in cards:
            item = extract_card_data(card, key)
            items.append(item)

        data[key] = items
        print(f"Extracted {len(items)} items from '{key}'")

    return data


def extract_card_data(card, section_type: str) -> dict:
    """Extract data from a single publication card."""
    item = {}

    # Get image
    img = card.find('img')
    if img:
        item['image'] = img.get('src', '').replace('images/publications/', '')
    else:
        item['image'] = ''

    # Get title and title URL
    h4 = card.find('h4')
    if h4:
        link = h4.find('a')
        if link:
            item['title'] = link.get_text(strip=True)
            item['title_url'] = link.get('href', '')
        else:
            item['title'] = h4.get_text(strip=True)
            item['title_url'] = ''
    else:
        item['title'] = ''
        item['title_url'] = ''

    # Get citation/description (first <p> that's not publication-links)
    div = card.find('div')
    if div:
        paragraphs = div.find_all('p', recursive=False)
        citation_p = None
        for p in paragraphs:
            if 'publication-links' not in p.get('class', []):
                citation_p = p
                break

        if citation_p:
            # Get inner HTML to preserve formatting
            item['citation'] = get_inner_html(citation_p)
        else:
            item['citation'] = ''
    else:
        item['citation'] = ''

    # Get publication links (PDF, CODE, DATA, etc.)
    links_p = card.find('p', class_='publication-links')
    if links_p:
        item['links_html'] = get_inner_html(links_p)
    else:
        item['links_html'] = ''

    return item


def get_inner_html(element) -> str:
    """Get the inner HTML of an element as a string."""
    return ''.join(str(child) for child in element.children).strip()


def save_to_excel(data: dict, output_path: Path):
    """Save extracted data to Excel spreadsheet with multiple sheets."""
    wb = openpyxl.Workbook()

    # Remove default sheet
    wb.remove(wb.active)

    sheet_configs = [
        ('papers', ['image', 'title', 'title_url', 'citation', 'links_html']),
        ('chapters', ['image', 'title', 'title_url', 'citation', 'links_html']),
        ('dissertations', ['image', 'title', 'title_url', 'citation', 'links_html']),
        ('talks', ['image', 'title', 'title_url', 'citation', 'links_html']),
        ('courses', ['image', 'title', 'title_url', 'citation', 'links_html']),
        ('posters', ['image', 'title', 'title_url', 'citation', 'links_html']),
    ]

    for sheet_name, columns in sheet_configs:
        ws = wb.create_sheet(title=sheet_name)

        # Write headers
        for col, header in enumerate(columns, 1):
            ws.cell(row=1, column=col, value=header)

        # Write data
        items = data.get(sheet_name, [])
        for row_num, item in enumerate(items, 2):
            for col, header in enumerate(columns, 1):
                value = item.get(header, '')
                ws.cell(row=row_num, column=col, value=value)

        # Adjust column widths
        for col in range(1, len(columns) + 1):
            ws.column_dimensions[openpyxl.utils.get_column_letter(col)].width = 40

    wb.save(output_path)
    print(f"Saved to {output_path}")


def main():
    project_root = Path(__file__).parent.parent
    html_path = project_root / 'publications.html'
    output_path = project_root / 'data' / 'publications.xlsx'

    print(f"Extracting from: {html_path}")
    data = extract_publications(html_path)

    total = sum(len(items) for items in data.values())
    print(f"\nTotal items extracted: {total}")

    save_to_excel(data, output_path)
    print("Done!")


if __name__ == '__main__':
    main()
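
As an illustration of the extractor (not part of the commit), the sketch below runs extract_card_data on a single hand-written card; the markup mirrors what generate_publication_card emits in build_publications.py, and all values are placeholders. It assumes the functions above are importable or pasted into the same session:

from bs4 import BeautifulSoup

# Hypothetical card; structure mirrors the generated publication-card markup.
sample = '''<div class="publication-card">
  <img src="images/publications/example.png" alt="Publication thumbnail">
  <div>
    <h4><a href="https://example.com/paper" target="_blank">An Example Paper</a></h4>
    <p>Doe, J. (2024). An Example Paper. Journal of Examples.</p>
    <p class="publication-links"><a href="#">PDF</a></p>
  </div>
</div>'''

card = BeautifulSoup(sample, 'html.parser').find('div', class_='publication-card')
print(extract_card_data(card, 'papers'))
# Expected: image='example.png', title='An Example Paper',
# title_url='https://example.com/paper', with citation and links_html preserved as inner HTML.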
