Commit 4c5e410

jeremymanning and claude committed
Add publications build system with Excel data source
- Create templates/publications.html with content markers
- Extract 104 publications from HTML to data/publications.xlsx
- Implement scripts/build_publications.py to generate HTML
- Add 13 comprehensive tests for build_publications.py
- Include extraction script for one-time data migration

The build system now generates publications.html from:
- Excel spreadsheet (data/publications.xlsx) with 6 sheets
- Template file (templates/publications.html) with markers

All 44 tests pass (31 utils + 13 publications).

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
1 parent 4d3dba0 commit 4c5e410
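
As a quick, illustrative check of the data source this commit introduces (not part of the commit itself), the expected workbook layout can be inspected with openpyxl; the sheet names, column headers, and path below are the ones used by the scripts in this diff:

# Sketch only: inspect data/publications.xlsx the way the build script expects it.
from pathlib import Path
import openpyxl

wb = openpyxl.load_workbook(Path('data/publications.xlsx'), read_only=True, data_only=True)
for name in wb.sheetnames:  # expected: papers, chapters, dissertations, talks, courses, posters
    # Header row should read: image, title, title_url, citation, links_html
    headers = next(wb[name].iter_rows(min_row=1, max_row=1, values_only=True))
    print(name, list(headers))
wb.close()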

File tree

6 files changed: +861 additions, -68 deletions


data/publications.xlsx

21.6 KB
Binary file not shown.

publications.html

Lines changed: 68 additions & 68 deletions
Large diffs are not rendered by default.

scripts/build_publications.py

Lines changed: 157 additions & 0 deletions
#!/usr/bin/env python3
"""Build publications.html from spreadsheet data.

Reads data from data/publications.xlsx and generates publications.html
using the template in templates/publications.html.
"""
from pathlib import Path
from typing import List, Dict, Any
import openpyxl

from utils import inject_content


def load_publications(xlsx_path: Path) -> Dict[str, List[Dict[str, Any]]]:
    """Load all publication data from Excel spreadsheet.

    Args:
        xlsx_path: Path to the publications.xlsx file

    Returns:
        Dictionary with keys for each section (papers, chapters, etc.)
        Each value is a list of publication dictionaries.
    """
    wb = openpyxl.load_workbook(xlsx_path, read_only=True, data_only=True)

    data = {}
    for sheet_name in wb.sheetnames:
        sheet = wb[sheet_name]

        # Get headers from first row
        headers = [cell.value for cell in sheet[1]]

        rows = []
        for row in sheet.iter_rows(min_row=2, values_only=True):
            # Skip empty rows
            if not any(cell is not None for cell in row):
                continue

            row_dict = {}
            for header, value in zip(headers, row):
                if value is None:
                    row_dict[header] = ''
                else:
                    row_dict[header] = value
            rows.append(row_dict)

        data[sheet_name] = rows

    wb.close()
    return data


def generate_publication_card(pub: Dict[str, Any]) -> str:
    """Generate HTML for a single publication card.

    Args:
        pub: Dictionary with publication data (image, title, title_url, citation, links_html)

    Returns:
        HTML string for the publication card
    """
    image = pub.get('image', '')
    title = pub.get('title', '')
    title_url = pub.get('title_url', '')
    citation = pub.get('citation', '')
    links_html = pub.get('links_html', '')

    # Build image path - prepend directory if not empty
    if image:
        image_src = f"images/publications/{image}"
    else:
        image_src = ""

    # Build title HTML
    if title_url:
        title_html = f'<a href="{title_url}" target="_blank">{title}</a>'
    else:
        title_html = title

    # Build links paragraph if we have links
    links_p = ''
    if links_html:
        links_p = f'\n          <p class="publication-links">{links_html}</p>'

    # Build citation paragraph if we have citation
    citation_p = ''
    if citation:
        citation_p = f'\n          <p>{citation}</p>'

    html = f'''      <div class="publication-card">
        <img src="{image_src}" alt="Publication thumbnail">
        <div>
          <h4>{title_html}</h4>{citation_p}{links_p}
        </div>
      </div>'''

    return html


def generate_section_content(publications: List[Dict[str, Any]]) -> str:
    """Generate HTML content for a publications section.

    Args:
        publications: List of publication dictionaries

    Returns:
        HTML string with all publication cards for the section
    """
    cards = [generate_publication_card(pub) for pub in publications]
    return '\n\n'.join(cards)


def build_publications(
    data_path: Path,
    template_path: Path,
    output_path: Path
) -> None:
    """Build publications.html from data and template.

    Args:
        data_path: Path to publications.xlsx
        template_path: Path to template HTML file
        output_path: Path for generated HTML file
    """
    # Load data
    data = load_publications(data_path)

    # Generate content for each section
    replacements = {
        'PAPERS_CONTENT': generate_section_content(data.get('papers', [])),
        'CHAPTERS_CONTENT': generate_section_content(data.get('chapters', [])),
        'DISSERTATIONS_CONTENT': generate_section_content(data.get('dissertations', [])),
        'TALKS_CONTENT': generate_section_content(data.get('talks', [])),
        'COURSES_CONTENT': generate_section_content(data.get('courses', [])),
        'POSTERS_CONTENT': generate_section_content(data.get('posters', [])),
    }

    # Inject into template
    inject_content(template_path, output_path, replacements)

    # Report
    total = sum(len(items) for items in data.values())
    print(f"Generated {output_path} with {total} publications")


def main():
    """Main entry point for CLI usage."""
    project_root = Path(__file__).parent.parent
    data_path = project_root / 'data' / 'publications.xlsx'
    template_path = project_root / 'templates' / 'publications.html'
    output_path = project_root / 'publications.html'

    build_publications(data_path, template_path, output_path)


if __name__ == '__main__':
    main()
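
To make the card generator above concrete, here is a small usage sketch (not from the commit); the keys match the spreadsheet columns and every value is a placeholder:

# Hypothetical row; keys mirror the spreadsheet columns read by load_publications.
pub = {
    'image': 'example-paper.png',
    'title': 'An Example Paper',
    'title_url': 'https://example.com/paper',
    'citation': 'Doe, J. (2024). An Example Paper. Journal of Examples.',
    'links_html': '<a href="#">PDF</a>',
}
print(generate_publication_card(pub))  # prints one publication-card <div>

Because main() resolves paths relative to the script's location rather than the current working directory, invoking the script directly (python scripts/build_publications.py from the repository root) regenerates publications.html and prints the total publication count.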

scripts/extract_publications.py

Lines changed: 163 additions & 0 deletions
#!/usr/bin/env python3
"""Extract publication data from existing HTML into Excel spreadsheet.

This is a one-time script to migrate existing HTML data to the spreadsheet format.
"""
import re
from pathlib import Path
from bs4 import BeautifulSoup
import openpyxl


def extract_publications(html_path: Path) -> dict:
    """Extract all publication data from HTML file.

    Returns dict with keys: papers, chapters, dissertations, talks, courses, posters
    """
    with open(html_path, 'r', encoding='utf-8') as f:
        soup = BeautifulSoup(f.read(), 'html.parser')

    sections = {
        'papers': 'papers',
        'chapters': 'chapters',
        'dissertations': 'dissertations',
        'talks': 'talks',
        'courses': 'course-mats',
        'posters': 'posters'
    }

    data = {}

    for key, section_id in sections.items():
        section = soup.find('section', id=section_id)
        if not section:
            print(f"Warning: Section '{section_id}' not found")
            data[key] = []
            continue

        cards = section.find_all('div', class_='publication-card')
        items = []

        for card in cards:
            item = extract_card_data(card, key)
            items.append(item)

        data[key] = items
        print(f"Extracted {len(items)} items from '{key}'")

    return data


def extract_card_data(card, section_type: str) -> dict:
    """Extract data from a single publication card."""
    item = {}

    # Get image
    img = card.find('img')
    if img:
        item['image'] = img.get('src', '').replace('images/publications/', '')
    else:
        item['image'] = ''

    # Get title and title URL
    h4 = card.find('h4')
    if h4:
        link = h4.find('a')
        if link:
            item['title'] = link.get_text(strip=True)
            item['title_url'] = link.get('href', '')
        else:
            item['title'] = h4.get_text(strip=True)
            item['title_url'] = ''
    else:
        item['title'] = ''
        item['title_url'] = ''

    # Get citation/description (first <p> that's not publication-links)
    div = card.find('div')
    if div:
        paragraphs = div.find_all('p', recursive=False)
        citation_p = None
        for p in paragraphs:
            if 'publication-links' not in p.get('class', []):
                citation_p = p
                break

        if citation_p:
            # Get inner HTML to preserve formatting
            item['citation'] = get_inner_html(citation_p)
        else:
            item['citation'] = ''
    else:
        item['citation'] = ''

    # Get publication links (PDF, CODE, DATA, etc.)
    links_p = card.find('p', class_='publication-links')
    if links_p:
        item['links_html'] = get_inner_html(links_p)
    else:
        item['links_html'] = ''

    return item


def get_inner_html(element) -> str:
    """Get the inner HTML of an element as a string."""
    return ''.join(str(child) for child in element.children).strip()


def save_to_excel(data: dict, output_path: Path):
    """Save extracted data to Excel spreadsheet with multiple sheets."""
    wb = openpyxl.Workbook()

    # Remove default sheet
    wb.remove(wb.active)

    sheet_configs = [
        ('papers', ['image', 'title', 'title_url', 'citation', 'links_html']),
        ('chapters', ['image', 'title', 'title_url', 'citation', 'links_html']),
        ('dissertations', ['image', 'title', 'title_url', 'citation', 'links_html']),
        ('talks', ['image', 'title', 'title_url', 'citation', 'links_html']),
        ('courses', ['image', 'title', 'title_url', 'citation', 'links_html']),
        ('posters', ['image', 'title', 'title_url', 'citation', 'links_html']),
    ]

    for sheet_name, columns in sheet_configs:
        ws = wb.create_sheet(title=sheet_name)

        # Write headers
        for col, header in enumerate(columns, 1):
            ws.cell(row=1, column=col, value=header)

        # Write data
        items = data.get(sheet_name, [])
        for row_num, item in enumerate(items, 2):
            for col, header in enumerate(columns, 1):
                value = item.get(header, '')
                ws.cell(row=row_num, column=col, value=value)

        # Adjust column widths
        for col in range(1, len(columns) + 1):
            ws.column_dimensions[openpyxl.utils.get_column_letter(col)].width = 40

    wb.save(output_path)
    print(f"Saved to {output_path}")


def main():
    project_root = Path(__file__).parent.parent
    html_path = project_root / 'publications.html'
    output_path = project_root / 'data' / 'publications.xlsx'

    print(f"Extracting from: {html_path}")
    data = extract_publications(html_path)

    total = sum(len(items) for items in data.values())
    print(f"\nTotal items extracted: {total}")

    save_to_excel(data, output_path)
    print("Done!")


if __name__ == '__main__':
    main()
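
As an illustration of the extractor (not part of the commit), the sketch below runs extract_card_data on a single hand-written card; the markup mirrors what generate_publication_card emits in build_publications.py, and all values are placeholders. It assumes the functions above are importable or pasted into the same session:

from bs4 import BeautifulSoup

# Hypothetical card; structure mirrors the generated publication-card markup.
sample = '''<div class="publication-card">
  <img src="images/publications/example.png" alt="Publication thumbnail">
  <div>
    <h4><a href="https://example.com/paper" target="_blank">An Example Paper</a></h4>
    <p>Doe, J. (2024). An Example Paper. Journal of Examples.</p>
    <p class="publication-links"><a href="#">PDF</a></p>
  </div>
</div>'''

card = BeautifulSoup(sample, 'html.parser').find('div', class_='publication-card')
print(extract_card_data(card, 'papers'))
# Expected: image='example.png', title='An Example Paper',
# title_url='https://example.com/paper', with citation and links_html preserved as inner HTML.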
