From d0b0be6850aceb99252c3be509cf9eb6513e9a01 Mon Sep 17 00:00:00 2001
From: Claude
Date: Tue, 24 Mar 2026 03:03:30 +0000
Subject: [PATCH 1/2] Add Congressional Record ingestion, display, and LLM summaries

Full-stack implementation for Congressional Record support:

Backend:
- New SQLAlchemy models (crec_issue, crec_granule, crec_speech, crec_bill_reference, crec_summary) with Alembic migration
- Importer for downloading/parsing daily CREC bulk data from govinfo.gov with speaker resolution and bill reference extraction
- Extended cite_parser with bill reference regex (H.R., S., etc.)
- LLM summarizer for granule-level and daily debate summaries
- FastAPI endpoints for browsing issues, granules, speaker stats, and activity calendar

Frontend:
- Prisma schema models for all new tables
- tRPC router with list, getIssue, getGranule, speakerStats, activityCalendar, debatesForBill, and legislatorStats procedures
- /congress/record browse page with date-based issue listing
- /congress/record/[issueId] daily view with Senate/House sections
- /congress/record/[issueId]/[granuleId] transcript with speech cards, speaker attribution, bill reference chips, and AI summary
- /congress/record/stats speaker leaderboard with word count rankings
- Bill detail "Debates" tab showing Congressional Record mentions
- Legislator detail speaking stats card with recent speeches
- SpeechCard component with party colors, avatars, and bill ref links

https://claude.ai/code/session_01EkXe8D2h9DAxnQbiWsJyAm
---
 .../b2c3d4e5f6a7_congressional_record.py | 101 ++++
 backend/congress_db/models.py | 138 ++++++
 backend/congress_fastapi/app.py | 2 +
 .../handlers/congressional_record.py | 233 +++++++++
 .../models/congressional_record.py | 92 ++++
 .../routes/congressional_record.py | 94 ++++
 .../importers/congressional_record.py | 460 ++++++++++++++++++
 .../prompt_runners/crec_summarizer.py | 257 ++++++++++
 backend/congress_parser/utils/cite_parser.py | 72 +++
 hillstack/prisma/schema.prisma | 98 ++++
.../congress/bills/[billId]/debates/page.tsx | 162 ++++++ .../legislators/[bioguideId]/page.tsx | 2 + .../record/[issueId]/[granuleId]/page.tsx | 86 ++++ .../app/congress/record/[issueId]/page.tsx | 143 ++++++ hillstack/src/app/congress/record/page.tsx | 99 ++++ .../src/app/congress/record/stats/page.tsx | 139 ++++++ .../record/LegislatorSpeakingStats.tsx | 155 ++++++ .../src/components/record/SpeechCard.tsx | 117 +++++ hillstack/src/constants/navigation.tsx | 12 + hillstack/src/server/api/root.ts | 2 + .../server/api/routers/congressionalRecord.ts | 407 ++++++++++++++++ 21 files changed, 2871 insertions(+) create mode 100644 backend/.alembic/versions/b2c3d4e5f6a7_congressional_record.py create mode 100644 backend/congress_fastapi/handlers/congressional_record.py create mode 100644 backend/congress_fastapi/models/congressional_record.py create mode 100644 backend/congress_fastapi/routes/congressional_record.py create mode 100644 backend/congress_parser/importers/congressional_record.py create mode 100644 backend/congress_parser/prompt_runners/crec_summarizer.py create mode 100644 hillstack/src/app/congress/bills/[billId]/debates/page.tsx create mode 100644 hillstack/src/app/congress/record/[issueId]/[granuleId]/page.tsx create mode 100644 hillstack/src/app/congress/record/[issueId]/page.tsx create mode 100644 hillstack/src/app/congress/record/page.tsx create mode 100644 hillstack/src/app/congress/record/stats/page.tsx create mode 100644 hillstack/src/components/record/LegislatorSpeakingStats.tsx create mode 100644 hillstack/src/components/record/SpeechCard.tsx create mode 100644 hillstack/src/server/api/routers/congressionalRecord.ts diff --git a/backend/.alembic/versions/b2c3d4e5f6a7_congressional_record.py b/backend/.alembic/versions/b2c3d4e5f6a7_congressional_record.py new file mode 100644 index 00000000..cd7789b5 --- /dev/null +++ b/backend/.alembic/versions/b2c3d4e5f6a7_congressional_record.py @@ -0,0 +1,101 @@ +"""Add Congressional Record tables + +Revision 
ID: b2c3d4e5f6a7 +Revises: a1b2c3d4e5f6 +Create Date: 2026-03-24 00:00:00.000000 + +""" +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa + +revision: str = 'b2c3d4e5f6a7' +down_revision: Union[str, None] = 'a1b2c3d4e5f6' +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + # Create CRECSection enum type + crec_section = sa.Enum('Senate', 'House', 'Extensions', 'DailyDigest', name='crecsection') + crec_section.create(op.get_bind(), checkfirst=True) + + op.create_table( + 'crec_issue', + sa.Column('crec_issue_id', sa.Integer(), primary_key=True, autoincrement=True), + sa.Column('issue_date', sa.Date(), nullable=True), + sa.Column('congress_id', sa.Integer(), sa.ForeignKey('congress.congress_id', ondelete='CASCADE'), nullable=True), + sa.Column('package_id', sa.String(), nullable=True), + sa.Column('created_at', sa.DateTime(), server_default=sa.func.now()), + ) + op.create_index('ix_crec_issue_issue_date', 'crec_issue', ['issue_date'], unique=True) + op.create_index('ix_crec_issue_congress_id', 'crec_issue', ['congress_id']) + op.create_unique_constraint('uq_crec_issue_package_id', 'crec_issue', ['package_id']) + + op.create_table( + 'crec_granule', + sa.Column('crec_granule_id', sa.Integer(), primary_key=True, autoincrement=True), + sa.Column('crec_issue_id', sa.Integer(), sa.ForeignKey('crec_issue.crec_issue_id', ondelete='CASCADE'), nullable=True), + sa.Column('granule_id', sa.String(), nullable=True), + sa.Column('section', crec_section, nullable=True), + sa.Column('title', sa.String(), nullable=True), + sa.Column('page_start', sa.String(), nullable=True), + sa.Column('page_end', sa.String(), nullable=True), + sa.Column('order_number', sa.Integer(), default=0), + sa.Column('created_at', sa.DateTime(), server_default=sa.func.now()), + ) + op.create_index('ix_crec_granule_crec_issue_id', 'crec_granule', ['crec_issue_id']) + 
op.create_index('ix_crec_granule_section', 'crec_granule', ['section']) + op.create_unique_constraint('uq_crec_granule_granule_id', 'crec_granule', ['granule_id']) + + op.create_table( + 'crec_speech', + sa.Column('crec_speech_id', sa.Integer(), primary_key=True, autoincrement=True), + sa.Column('crec_granule_id', sa.Integer(), sa.ForeignKey('crec_granule.crec_granule_id', ondelete='CASCADE'), nullable=True), + sa.Column('speaker_raw', sa.String(), nullable=True), + sa.Column('legislator_bioguide_id', sa.String(), sa.ForeignKey('legislator.bioguide_id', ondelete='SET NULL'), nullable=True), + sa.Column('order_number', sa.Integer(), default=0), + sa.Column('content_text', sa.String(), nullable=True), + sa.Column('word_count', sa.Integer(), default=0), + sa.Column('created_at', sa.DateTime(), server_default=sa.func.now()), + ) + op.create_index('ix_crec_speech_crec_granule_id', 'crec_speech', ['crec_granule_id']) + op.create_index('ix_crec_speech_legislator_bioguide_id', 'crec_speech', ['legislator_bioguide_id']) + + op.create_table( + 'crec_bill_reference', + sa.Column('crec_bill_reference_id', sa.Integer(), primary_key=True, autoincrement=True), + sa.Column('crec_speech_id', sa.Integer(), sa.ForeignKey('crec_speech.crec_speech_id', ondelete='CASCADE'), nullable=True), + sa.Column('legislation_id', sa.Integer(), sa.ForeignKey('legislation.legislation_id', ondelete='SET NULL'), nullable=True), + sa.Column('cite_text', sa.String(), nullable=True), + sa.Column('cite_type', sa.String(), nullable=True), + sa.Column('start_offset', sa.Integer(), nullable=True), + sa.Column('end_offset', sa.Integer(), nullable=True), + ) + op.create_index('ix_crec_bill_reference_crec_speech_id', 'crec_bill_reference', ['crec_speech_id']) + op.create_index('ix_crec_bill_reference_legislation_id', 'crec_bill_reference', ['legislation_id']) + + op.create_table( + 'crec_summary', + sa.Column('crec_summary_id', sa.Integer(), primary_key=True, autoincrement=True), + sa.Column('crec_granule_id', 
sa.Integer(), sa.ForeignKey('crec_granule.crec_granule_id', ondelete='CASCADE'), nullable=True), + sa.Column('crec_issue_id', sa.Integer(), sa.ForeignKey('crec_issue.crec_issue_id', ondelete='CASCADE'), nullable=True), + sa.Column('summary', sa.String(), nullable=True), + sa.Column('summary_type', sa.String(), nullable=True), + sa.Column('prompt_batch_id', sa.Integer(), sa.ForeignKey('prompts.prompt_batch.prompt_batch_id', ondelete='CASCADE'), nullable=True), + sa.Column('created_at', sa.DateTime(), server_default=sa.func.now()), + schema='prompts', + ) + op.create_index('ix_crec_summary_crec_granule_id', 'crec_summary', ['crec_granule_id'], schema='prompts') + op.create_index('ix_crec_summary_crec_issue_id', 'crec_summary', ['crec_issue_id'], schema='prompts') + op.create_index('ix_crec_summary_prompt_batch_id', 'crec_summary', ['prompt_batch_id'], schema='prompts') + + +def downgrade() -> None: + op.drop_table('crec_summary', schema='prompts') + op.drop_table('crec_bill_reference') + op.drop_table('crec_speech') + op.drop_table('crec_granule') + op.drop_table('crec_issue') + sa.Enum(name='crecsection').drop(op.get_bind(), checkfirst=True) diff --git a/backend/congress_db/models.py b/backend/congress_db/models.py index fb2ca6d0..c9df9a95 100644 --- a/backend/congress_db/models.py +++ b/backend/congress_db/models.py @@ -45,6 +45,13 @@ def bind_expression(self, bindvalue): return sa.cast(bindvalue, self) +class CRECSection(str, enum.Enum): + Senate = "Senate" + House = "House" + Extensions = "Extensions" + DailyDigest = "DailyDigest" + + class LegislatorJob(str, enum.Enum): Senator = "Senator" Representative = "Representative" @@ -1262,6 +1269,137 @@ class Appropriation(AppropriationsBase): purpose = Column(String, default="") +class CRECIssue(Base): + """ + One row per daily Congressional Record issue + """ + + __tablename__ = "crec_issue" + + crec_issue_id = Column(Integer, primary_key=True) + issue_date = Column(Date, unique=True, index=True) + congress_id = 
Column( + Integer, ForeignKey("congress.congress_id", ondelete="CASCADE"), index=True + ) + package_id = Column(String, unique=True) + created_at = Column(DateTime(timezone=False), server_default=func.now()) + + granules = relationship("CRECGranule", back_populates="issue") + + +class CRECGranule(Base): + """ + A discrete item/debate/segment within a daily Congressional Record issue + """ + + __tablename__ = "crec_granule" + + crec_granule_id = Column(Integer, primary_key=True) + crec_issue_id = Column( + Integer, + ForeignKey("crec_issue.crec_issue_id", ondelete="CASCADE"), + index=True, + ) + granule_id = Column(String, unique=True) + section = Column(Enum(CRECSection), index=True) + title = Column(String) + page_start = Column(String, nullable=True) + page_end = Column(String, nullable=True) + order_number = Column(Integer, default=0) + created_at = Column(DateTime(timezone=False), server_default=func.now()) + + issue = relationship("CRECIssue", back_populates="granules") + speeches = relationship("CRECSpeech", back_populates="granule") + + +class CRECSpeech(Base): + """ + Individual speech segment within a granule, with speaker attribution + """ + + __tablename__ = "crec_speech" + + crec_speech_id = Column(Integer, primary_key=True) + crec_granule_id = Column( + Integer, + ForeignKey("crec_granule.crec_granule_id", ondelete="CASCADE"), + index=True, + ) + speaker_raw = Column(String, nullable=True) + legislator_bioguide_id = Column( + String, + ForeignKey("legislator.bioguide_id", ondelete="SET NULL"), + index=True, + nullable=True, + ) + order_number = Column(Integer, default=0) + content_text = Column(String) + word_count = Column(Integer, default=0) + created_at = Column(DateTime(timezone=False), server_default=func.now()) + + granule = relationship("CRECGranule", back_populates="speeches") + bill_references = relationship("CRECBillReference", back_populates="speech") + + +class CRECBillReference(Base): + """ + Bill citations found within speech text in the 
Congressional Record + """ + + __tablename__ = "crec_bill_reference" + + crec_bill_reference_id = Column(Integer, primary_key=True) + crec_speech_id = Column( + Integer, + ForeignKey("crec_speech.crec_speech_id", ondelete="CASCADE"), + index=True, + ) + legislation_id = Column( + Integer, + ForeignKey("legislation.legislation_id", ondelete="SET NULL"), + index=True, + nullable=True, + ) + cite_text = Column(String) + cite_type = Column(String) + start_offset = Column(Integer) + end_offset = Column(Integer) + + speech = relationship("CRECSpeech", back_populates="bill_references") + + +class CRECSummary(PromptsBase): + """ + LLM-generated summaries of Congressional Record debates + """ + + __tablename__ = "crec_summary" + __table_args__ = {"schema": "prompts"} + + crec_summary_id = Column(Integer, primary_key=True) + crec_granule_id = Column( + Integer, + ForeignKey("crec_granule.crec_granule_id", ondelete="CASCADE"), + index=True, + nullable=True, + ) + crec_issue_id = Column( + Integer, + ForeignKey("crec_issue.crec_issue_id", ondelete="CASCADE"), + index=True, + nullable=True, + ) + summary = Column(String) + summary_type = Column(String) + prompt_batch_id = Column( + Integer, + ForeignKey(PromptBatch.prompt_batch_id, ondelete="CASCADE"), + index=True, + nullable=True, + ) + created_at = Column(DateTime(timezone=False), server_default=func.now()) + + class User(AuthenticationBase): __tablename__ = "user" diff --git a/backend/congress_fastapi/app.py b/backend/congress_fastapi/app.py index 120aa994..450a9542 100644 --- a/backend/congress_fastapi/app.py +++ b/backend/congress_fastapi/app.py @@ -13,6 +13,7 @@ from congress_fastapi.routes.stats import router as stats_router from congress_fastapi.routes.uscode import router as uscode_router from congress_fastapi.routes.committees import router as committees_router +from congress_fastapi.routes.congressional_record import router as crec_router from congress_fastapi.utils.limiter import limiter from slowapi import 
_rate_limit_exceeded_handler from slowapi.errors import RateLimitExceeded @@ -56,4 +57,5 @@ async def log_exceptions_middleware(request: Request, call_next): app.include_router(stats_router) app.include_router(uscode_router) app.include_router(committees_router) +app.include_router(crec_router) print("Loaded") diff --git a/backend/congress_fastapi/handlers/congressional_record.py b/backend/congress_fastapi/handlers/congressional_record.py new file mode 100644 index 00000000..e6c4c6a0 --- /dev/null +++ b/backend/congress_fastapi/handlers/congressional_record.py @@ -0,0 +1,233 @@ +from typing import List, Optional, Tuple +from datetime import date + +from sqlalchemy import select, func, desc, and_ + +from congress_db.models import ( + CRECIssue, + CRECGranule, + CRECSpeech, + CRECBillReference, + CRECSummary, + Legislator, +) +from congress_fastapi.db.postgres import get_database +from congress_fastapi.models.congressional_record import ( + CRECIssueInfo, + CRECGranuleInfo, + CRECSpeechDetailInfo, + CRECBillReferenceInfo, + CRECSpeakerStatInfo, + CRECActivityItem, +) + + +async def get_issues( + start_date: Optional[date], + end_date: Optional[date], + chamber: Optional[str], + page: int = 1, + page_size: int = 20, +) -> Tuple[List[CRECIssueInfo], int]: + database = await get_database() + + query = select(*CRECIssueInfo.sqlalchemy_columns()).select_from(CRECIssue) + count_query = select(func.count(CRECIssue.crec_issue_id)) + + if start_date: + query = query.where(CRECIssue.issue_date >= start_date) + count_query = count_query.where(CRECIssue.issue_date >= start_date) + if end_date: + query = query.where(CRECIssue.issue_date <= end_date) + count_query = count_query.where(CRECIssue.issue_date <= end_date) + + total = await database.fetch_val(count_query) + + query = query.order_by(desc(CRECIssue.issue_date)) + query = query.limit(page_size).offset((page - 1) * page_size) + + results = await database.fetch_all(query) + return [CRECIssueInfo(**r) for r in results], total 
+ + +async def get_issue_detail(issue_id: int): + database = await get_database() + + issue_query = select(*CRECIssueInfo.sqlalchemy_columns()).where( + CRECIssue.crec_issue_id == issue_id + ) + issue = await database.fetch_one(issue_query) + if not issue: + return None, [] + + granules_query = ( + select(*CRECGranuleInfo.sqlalchemy_columns()) + .where(CRECGranule.crec_issue_id == issue_id) + .order_by(CRECGranule.order_number) + ) + granules = await database.fetch_all(granules_query) + + return CRECIssueInfo(**issue), [CRECGranuleInfo(**g) for g in granules] + + +async def get_granule_detail(granule_id: int): + database = await get_database() + + granule_query = select(*CRECGranuleInfo.sqlalchemy_columns()).where( + CRECGranule.crec_granule_id == granule_id + ) + granule = await database.fetch_one(granule_query) + if not granule: + return None, [], None + + speeches_query = ( + select( + CRECSpeech.crec_speech_id, + CRECSpeech.speaker_raw, + CRECSpeech.legislator_bioguide_id, + CRECSpeech.order_number, + CRECSpeech.content_text, + CRECSpeech.word_count, + ) + .where(CRECSpeech.crec_granule_id == granule_id) + .order_by(CRECSpeech.order_number) + ) + speech_rows = await database.fetch_all(speeches_query) + + speeches = [] + for row in speech_rows: + refs_query = ( + select( + CRECBillReference.crec_bill_reference_id, + CRECBillReference.legislation_id, + CRECBillReference.cite_text, + CRECBillReference.cite_type, + CRECBillReference.start_offset, + CRECBillReference.end_offset, + ) + .where(CRECBillReference.crec_speech_id == row["crec_speech_id"]) + ) + ref_rows = await database.fetch_all(refs_query) + + speech = CRECSpeechDetailInfo( + crec_speech_id=row["crec_speech_id"], + speaker_raw=row["speaker_raw"], + legislator_bioguide_id=row["legislator_bioguide_id"], + order_number=row["order_number"], + content_text=row["content_text"], + word_count=row["word_count"], + bill_references=[CRECBillReferenceInfo(**r) for r in ref_rows], + ) + speeches.append(speech) + + 
summary_query = ( + select(CRECSummary.summary) + .where( + CRECSummary.crec_granule_id == granule_id, + CRECSummary.summary_type == "granule", + ) + .limit(1) + ) + summary_row = await database.fetch_one(summary_query) + summary = summary_row["summary"] if summary_row else None + + return CRECGranuleInfo(**granule), speeches, summary + + +async def get_speaker_stats( + start_date: Optional[date], + end_date: Optional[date], + chamber: Optional[str], + limit: int = 20, + page: int = 1, +) -> Tuple[List[CRECSpeakerStatInfo], int]: + database = await get_database() + + conditions = [CRECSpeech.legislator_bioguide_id.isnot(None)] + + if start_date or end_date or chamber: + join_conditions = [] + if start_date: + join_conditions.append(CRECIssue.issue_date >= start_date) + if end_date: + join_conditions.append(CRECIssue.issue_date <= end_date) + + query = ( + select( + CRECSpeech.legislator_bioguide_id.label("bioguide_id"), + Legislator.first_name, + Legislator.last_name, + Legislator.party, + Legislator.state, + Legislator.image_url, + func.sum(CRECSpeech.word_count).label("total_words"), + func.count(CRECSpeech.crec_speech_id).label("speech_count"), + ) + .join(CRECGranule, CRECSpeech.crec_granule_id == CRECGranule.crec_granule_id) + .join(CRECIssue, CRECGranule.crec_issue_id == CRECIssue.crec_issue_id) + .join(Legislator, CRECSpeech.legislator_bioguide_id == Legislator.bioguide_id) + .where(and_(*conditions, *join_conditions)) + ) + + if chamber: + query = query.where(CRECGranule.section == chamber) + else: + query = ( + select( + CRECSpeech.legislator_bioguide_id.label("bioguide_id"), + Legislator.first_name, + Legislator.last_name, + Legislator.party, + Legislator.state, + Legislator.image_url, + func.sum(CRECSpeech.word_count).label("total_words"), + func.count(CRECSpeech.crec_speech_id).label("speech_count"), + ) + .join(Legislator, CRECSpeech.legislator_bioguide_id == Legislator.bioguide_id) + .where(and_(*conditions)) + ) + + query = query.group_by( + 
CRECSpeech.legislator_bioguide_id, + Legislator.first_name, + Legislator.last_name, + Legislator.party, + Legislator.state, + Legislator.image_url, + ) + + count_subquery = query.subquery() + total = await database.fetch_val( + select(func.count()).select_from(count_subquery) + ) + + query = query.order_by(desc("total_words")) + query = query.limit(limit).offset((page - 1) * limit) + + results = await database.fetch_all(query) + return [CRECSpeakerStatInfo(**r) for r in results], total + + +async def get_activity_calendar( + start_date: Optional[date] = None, + end_date: Optional[date] = None, +) -> List[CRECActivityItem]: + database = await get_database() + + query = ( + select( + CRECIssue.issue_date.label("date"), + func.count(CRECGranule.crec_granule_id).label("count"), + ) + .join(CRECGranule, CRECIssue.crec_issue_id == CRECGranule.crec_issue_id) + .group_by(CRECIssue.issue_date) + .order_by(CRECIssue.issue_date) + ) + + if start_date: + query = query.where(CRECIssue.issue_date >= start_date) + if end_date: + query = query.where(CRECIssue.issue_date <= end_date) + + results = await database.fetch_all(query) + return [CRECActivityItem(**r) for r in results] diff --git a/backend/congress_fastapi/models/congressional_record.py b/backend/congress_fastapi/models/congressional_record.py new file mode 100644 index 00000000..0f957536 --- /dev/null +++ b/backend/congress_fastapi/models/congressional_record.py @@ -0,0 +1,92 @@ +from typing import Annotated, List, Optional +from datetime import date, datetime + +from pydantic import BaseModel +from congress_fastapi.models.abstract import MappableBase +from congress_db.models import ( + CRECIssue, + CRECGranule, + CRECSpeech, + CRECBillReference, +) + + +class CRECIssueInfo(MappableBase): + crec_issue_id: Annotated[int, CRECIssue.crec_issue_id] + issue_date: Annotated[Optional[date], CRECIssue.issue_date] + congress_id: Annotated[Optional[int], CRECIssue.congress_id] + package_id: Annotated[Optional[str], 
CRECIssue.package_id] + + +class CRECIssueListResponse(BaseModel): + issues: List[CRECIssueInfo] + total_results: int + + +class CRECGranuleInfo(MappableBase): + crec_granule_id: Annotated[int, CRECGranule.crec_granule_id] + crec_issue_id: Annotated[Optional[int], CRECGranule.crec_issue_id] + granule_id: Annotated[Optional[str], CRECGranule.granule_id] + section: Annotated[Optional[str], CRECGranule.section] + title: Annotated[Optional[str], CRECGranule.title] + page_start: Annotated[Optional[str], CRECGranule.page_start] + page_end: Annotated[Optional[str], CRECGranule.page_end] + order_number: Annotated[Optional[int], CRECGranule.order_number] + + +class CRECIssueDetailResponse(BaseModel): + issue: CRECIssueInfo + granules: List[CRECGranuleInfo] + + +class CRECSpeechInfo(BaseModel): + crec_speech_id: int + speaker_raw: Optional[str] = None + legislator_bioguide_id: Optional[str] = None + order_number: Optional[int] = None + content_text: Optional[str] = None + word_count: Optional[int] = None + + +class CRECBillReferenceInfo(BaseModel): + crec_bill_reference_id: int + legislation_id: Optional[int] = None + cite_text: Optional[str] = None + cite_type: Optional[str] = None + start_offset: Optional[int] = None + end_offset: Optional[int] = None + + +class CRECSpeechDetailInfo(CRECSpeechInfo): + bill_references: List[CRECBillReferenceInfo] = [] + + +class CRECGranuleDetailResponse(BaseModel): + granule: CRECGranuleInfo + speeches: List[CRECSpeechDetailInfo] + summary: Optional[str] = None + + +class CRECSpeakerStatInfo(BaseModel): + bioguide_id: str + first_name: Optional[str] = None + last_name: Optional[str] = None + party: Optional[str] = None + state: Optional[str] = None + image_url: Optional[str] = None + total_words: int + speech_count: int + + +class CRECSpeakerStatsResponse(BaseModel): + speakers: List[CRECSpeakerStatInfo] + total_results: int + + +class CRECActivityItem(BaseModel): + date: date + count: int + + +class CRECActivityResponse(BaseModel): + 
activity: List[CRECActivityItem] diff --git a/backend/congress_fastapi/routes/congressional_record.py b/backend/congress_fastapi/routes/congressional_record.py new file mode 100644 index 00000000..2f956036 --- /dev/null +++ b/backend/congress_fastapi/routes/congressional_record.py @@ -0,0 +1,94 @@ +from typing import List, Optional +from datetime import date + +from fastapi import APIRouter, HTTPException, Query, status + +from congress_fastapi.handlers.congressional_record import ( + get_issues, + get_issue_detail, + get_granule_detail, + get_speaker_stats, + get_activity_calendar, +) +from congress_fastapi.models.congressional_record import ( + CRECIssueListResponse, + CRECIssueDetailResponse, + CRECGranuleDetailResponse, + CRECSpeakerStatsResponse, + CRECActivityResponse, +) +from congress_fastapi.models.errors import Error + +router = APIRouter(tags=["Congressional Record"]) + + +@router.get("/congressional-record") +async def list_issues( + page: int = Query(1, description="Page number"), + page_size: int = Query(20, alias="pageSize"), + start_date: Optional[date] = Query(None, alias="startDate"), + end_date: Optional[date] = Query(None, alias="endDate"), + chamber: Optional[str] = Query(None, description="Filter by chamber section"), +) -> CRECIssueListResponse: + """List Congressional Record daily issues with pagination and date filtering.""" + issues, total = await get_issues(start_date, end_date, chamber, page, page_size) + return CRECIssueListResponse(issues=issues, total_results=total) + + +@router.get( + "/congressional-record/{issue_id}", + responses={ + status.HTTP_404_NOT_FOUND: {"model": Error, "detail": "Issue not found"}, + }, +) +async def get_issue(issue_id: int) -> CRECIssueDetailResponse: + """Get a single Congressional Record issue with its granules.""" + issue, granules = await get_issue_detail(issue_id) + if issue is None: + raise HTTPException( + status_code=status.HTTP_404_NOT_FOUND, detail="Issue not found" + ) + return 
CRECIssueDetailResponse(issue=issue, granules=granules) + + +@router.get( + "/congressional-record/{issue_id}/granule/{granule_id}", + responses={ + status.HTTP_404_NOT_FOUND: {"model": Error, "detail": "Granule not found"}, + }, +) +async def get_granule(issue_id: int, granule_id: int) -> CRECGranuleDetailResponse: + """Get a granule with all its speeches and bill references.""" + granule, speeches, summary = await get_granule_detail(granule_id) + if granule is None: + raise HTTPException( + status_code=status.HTTP_404_NOT_FOUND, detail="Granule not found" + ) + return CRECGranuleDetailResponse( + granule=granule, speeches=speeches, summary=summary + ) + + +@router.get("/congressional-record/stats/speakers") +async def speaker_statistics( + start_date: Optional[date] = Query(None, alias="startDate"), + end_date: Optional[date] = Query(None, alias="endDate"), + chamber: Optional[str] = Query(None), + limit: int = Query(20), + page: int = Query(1), +) -> CRECSpeakerStatsResponse: + """Get speaker statistics - who talked the most by word count.""" + speakers, total = await get_speaker_stats( + start_date, end_date, chamber, limit, page + ) + return CRECSpeakerStatsResponse(speakers=speakers, total_results=total) + + +@router.get("/congressional-record/stats/activity") +async def activity_calendar( + start_date: Optional[date] = Query(None, alias="startDate"), + end_date: Optional[date] = Query(None, alias="endDate"), +) -> CRECActivityResponse: + """Get activity calendar data for heatmap visualization.""" + activity = await get_activity_calendar(start_date, end_date) + return CRECActivityResponse(activity=activity) diff --git a/backend/congress_parser/importers/congressional_record.py b/backend/congress_parser/importers/congressional_record.py new file mode 100644 index 00000000..4ceeb8a2 --- /dev/null +++ b/backend/congress_parser/importers/congressional_record.py @@ -0,0 +1,460 @@ +""" +Congressional Record importer. 
+ +Downloads and parses daily Congressional Record bulk data from govinfo.gov. +Each daily issue is a ZIP containing HTML/XML files organized by section +(Senate, House, Extensions of Remarks, Daily Digest). + +Bulk data URL pattern: + https://www.govinfo.gov/bulkdata/CREC/{year}/{month:02d}/{day:02d}/CREC-{year}-{month:02d}-{day:02d}.zip + +The importer: + 1. Iterates through dates in the current Congress (119th, starting Jan 2025) + 2. Downloads daily ZIP packages + 3. Parses HTML/XML granules to extract debate segments + 4. Identifies speakers and resolves them to legislator bioguide IDs + 5. Extracts bill references from speech text + 6. Stores everything in the crec_* tables + +Usage: + python -m congress_parser.importers.congressional_record +""" + +from datetime import datetime, date, timedelta +import io +import logging +import os +import re +import requests +import tempfile +import zipfile + +from lxml import etree, html +from unidecode import unidecode + +from congress_db.session import Session +from congress_db.models import ( + CRECIssue, + CRECGranule, + CRECSpeech, + CRECBillReference, + CRECSection, + Congress, + Legislator, + Legislation, + LegislationChamber, + LegislationType, +) +from congress_parser.utils.cite_parser import extract_bill_references + +logger = logging.getLogger(__name__) + +BULK_DATA_URL = "https://www.govinfo.gov/bulkdata/CREC/{year}/{month:02d}/{day:02d}/CREC-{year}-{month:02d}-{day:02d}.zip" + + +def calculate_congress_from_year() -> int: + current_year = datetime.now().year + return ((current_year - 2001) // 2) + 107 + + +def get_congress_start_date(congress_number: int) -> date: + """Get the start date for a given Congress number.""" + start_year = 2001 + (congress_number - 107) * 2 + return date(start_year, 1, 3) + + +def map_section(filename: str) -> CRECSection: + """Map a CREC filename or path to a CRECSection enum.""" + lower = filename.lower() + if "/senate/" in lower or "senate" in lower: + return CRECSection.Senate + 
elif "/house/" in lower or "house" in lower: + return CRECSection.House + elif "/extensions/" in lower or "extension" in lower: + return CRECSection.Extensions + elif "/dailydigest/" in lower or "digest" in lower: + return CRECSection.DailyDigest + return CRECSection.Senate + + +SPEAKER_PATTERN = re.compile( + r"^(?:Mr|Mrs|Ms|Miss|Madam|The)\.\s+([A-Z][A-Z\-\']+(?:\s+[A-Z][A-Z\-\']+)*)", + re.IGNORECASE, +) + +SPEAKER_PREFIX_PATTERN = re.compile( + r"^(?:Mr|Mrs|Ms|Miss)\.\s+(?:Speaker|President|SPEAKER|PRESIDENT)?\s*[,.]?\s*(?:Mr|Mrs|Ms|Miss)\.\s+([A-Z][A-Z\-\']+)", + re.IGNORECASE, +) + + +def extract_speaker_name(text: str) -> str: + """Extract speaker last name from a speech paragraph.""" + text = text.strip() + match = SPEAKER_PREFIX_PATTERN.match(text) + if match: + return match.group(1).upper() + match = SPEAKER_PATTERN.match(text) + if match: + name = match.group(1).upper() + if name not in ("SPEAKER", "PRESIDENT", "CHAIR", "CHAIRMAN", "CHAIRWOMAN"): + return name + return "" + + +def resolve_speaker(speaker_name: str, chamber: CRECSection, session) -> str: + """ + Resolve a raw speaker last name to a legislator bioguide_id. + Uses chamber to disambiguate between House and Senate members. 
+ """ + if not speaker_name: + return None + + from congress_db.models import LegislatorJob + + job = None + if chamber == CRECSection.Senate: + job = LegislatorJob.Senator + elif chamber == CRECSection.House: + job = LegislatorJob.Representative + + query = session.query(Legislator).filter( + Legislator.last_name == speaker_name.title() + ) + if job: + query = query.filter(Legislator.job == job) + + legislators = query.all() + if len(legislators) == 1: + return legislators[0].bioguide_id + elif len(legislators) > 1: + current_congress = calculate_congress_from_year() + for leg in legislators: + if leg.congress_id and current_congress in leg.congress_id: + return leg.bioguide_id + return legislators[0].bioguide_id + + return None + + +def resolve_bill_reference(ref: dict, congress_id: int, session) -> int: + """Resolve a bill reference dict to a legislation_id, or None.""" + chamber_map = { + "House": LegislationChamber.House, + "Senate": LegislationChamber.Senate, + } + type_map = { + "Bill": LegislationType.Bill, + "Resolution": LegislationType.Res, + "Joint Resolution": LegislationType.JRes, + "Continuing Resolution": LegislationType.CRes, + } + + chamber = chamber_map.get(ref["chamber"]) + leg_type = type_map.get(ref["legislation_type"]) + + if not chamber or not leg_type: + return None + + legislation = ( + session.query(Legislation) + .filter( + Legislation.chamber == chamber, + Legislation.number == ref["number"], + Legislation.legislation_type == leg_type, + Legislation.congress_id == congress_id, + ) + .first() + ) + return legislation.legislation_id if legislation else None + + +def parse_htm_content(content: str) -> list: + """ + Parse an HTML granule file and extract speech segments. 
def parse_mods_xml(mods_content: str) -> dict:
    """Parse the MODS XML metadata file of a daily CREC package.

    Every ``relatedItem`` of type ``constituent`` contributes one metadata
    record, stored under each identifier that can be derived for it:

    * the text of its "preferred citation" identifier (the original key), and
    * its ``ID`` attribute with a leading ``"id-"`` removed, so callers can
      look a granule up by its file basename.
      NOTE(review): assumes GovInfo writes ``ID="id-<granule name>"`` on
      constituents - confirm against a real package.

    Args:
        mods_content: The raw MODS document (text or bytes).

    Returns:
        A dict of identifier -> {"title", "page_start", "page_end"}. Page
        values are None when the extent is absent. An empty dict is returned
        when the document cannot be parsed.
    """
    try:
        root = etree.fromstring(mods_content)
    except Exception:
        # Best effort: a malformed MODS file only costs titles/pages, the
        # caller falls back to basenames.
        return {}

    ns = {"mods": "http://www.loc.gov/mods/v3"}
    granules = {}

    for related in root.findall(".//mods:relatedItem[@type='constituent']", ns):
        identifier_el = related.find("mods:identifier[@type='preferred citation']", ns)
        title_el = related.find("mods:titleInfo/mods:title", ns)

        # Collect every usable key for this constituent.
        keys = []
        if identifier_el is not None and identifier_el.text:
            citation = identifier_el.text.strip()
            if citation:
                keys.append(citation)
        item_id = (related.get("ID") or "").strip()
        if item_id.startswith("id-"):
            item_id = item_id[len("id-"):]
        if item_id and item_id not in keys:
            keys.append(item_id)
        if not keys:
            continue

        title = title_el.text.strip() if title_el is not None and title_el.text else ""

        page_start = None
        page_end = None
        extent_el = related.find("mods:part/mods:extent", ns)
        if extent_el is not None:
            start_el = extent_el.find("mods:start", ns)
            end_el = extent_el.find("mods:end", ns)
            if start_el is not None and start_el.text:
                page_start = start_el.text.strip()
            if end_el is not None and end_el.text:
                page_end = end_el.text.strip()

        for key in keys:
            # Separate dict per key so callers mutating one entry cannot
            # affect the other alias.
            granules[key] = {
                "title": title,
                "page_start": page_start,
                "page_end": page_end,
            }

    return granules
congress_id=congress_id, + package_id=package_id, + ) + session.add(issue) + session.flush() + + try: + zf = zipfile.ZipFile(io.BytesIO(resp.content)) + except zipfile.BadZipFile: + logger.warning(f"Bad ZIP file for {package_id}") + session.rollback() + return + + mods_metadata = {} + for name in zf.namelist(): + if name.endswith("mods.xml"): + try: + mods_content = zf.read(name) + mods_metadata = parse_mods_xml(mods_content) + except Exception as e: + logger.warning(f"Failed to parse MODS {name}: {e}") + break + + htm_files = sorted([ + n for n in zf.namelist() + if n.endswith(".htm") or n.endswith(".html") + ]) + + speaker_cache = {} + granule_order = 0 + + for htm_file in htm_files: + basename = os.path.splitext(os.path.basename(htm_file))[0] + section = map_section(htm_file) + + meta = mods_metadata.get(basename, {}) + title = meta.get("title", basename) + page_start = meta.get("page_start") + page_end = meta.get("page_end") + + granule = CRECGranule( + crec_issue_id=issue.crec_issue_id, + granule_id=f"{package_id}/{basename}", + section=section, + title=title, + page_start=page_start, + page_end=page_end, + order_number=granule_order, + ) + session.add(granule) + session.flush() + granule_order += 1 + + try: + content = zf.read(htm_file).decode("utf-8", errors="replace") + except Exception as e: + logger.warning(f"Failed to read {htm_file}: {e}") + continue + + speech_segments = parse_htm_content(content) + + for seg in speech_segments: + speaker_raw = seg["speaker_raw"] + + if speaker_raw in speaker_cache: + bioguide_id = speaker_cache[speaker_raw] + else: + bioguide_id = resolve_speaker(speaker_raw, section, session) + speaker_cache[speaker_raw] = bioguide_id + + content_text = seg["content_text"] + word_count = len(content_text.split()) + + speech = CRECSpeech( + crec_granule_id=granule.crec_granule_id, + speaker_raw=speaker_raw or None, + legislator_bioguide_id=bioguide_id, + order_number=seg["order_number"], + content_text=content_text, + 
def run_import(start_date: date = None, end_date: date = None):
    """Run the Congressional Record import for a date range.

    Args:
        start_date: First day to import. When omitted, the day after the most
            recently imported issue is used, or the start date of the current
            Congress when nothing has been imported yet.
        end_date: Last day to import (inclusive). Defaults to today.

    Weekends are skipped. A failure on one day is logged and rolled back so
    the remaining days are still attempted. The database session is always
    closed, including on the early "Congress not found" exit and on
    unexpected errors.
    """
    db = Session()
    try:
        congress_number = calculate_congress_from_year()
        congress = (
            db.query(Congress)
            .filter(Congress.session_number == congress_number)
            .first()
        )

        if not congress:
            logger.error(f"Congress {congress_number} not found in database")
            return

        # NOTE(review): every imported day is filed under the *current*
        # Congress, even when start_date reaches back further - confirm
        # whether back-fills across Congresses are expected.
        congress_id = congress.congress_id

        if start_date is None:
            latest = (
                db.query(CRECIssue)
                .order_by(CRECIssue.issue_date.desc())
                .first()
            )
            if latest:
                start_date = latest.issue_date + timedelta(days=1)
            else:
                start_date = get_congress_start_date(congress_number)

        if end_date is None:
            end_date = date.today()

        logger.info(f"Importing Congressional Record from {start_date} to {end_date}")

        current = start_date
        while current <= end_date:
            # Monday..Friday only (weekday() is 0-4).
            if current.weekday() < 5:
                try:
                    import_daily_record(current, db, congress_id)
                except Exception as e:
                    logger.error(f"Error importing {current}: {e}", exc_info=True)
                    db.rollback()
            current += timedelta(days=1)
    finally:
        db.close()
b/backend/congress_parser/prompt_runners/crec_summarizer.py new file mode 100644 index 00000000..953bc471 --- /dev/null +++ b/backend/congress_parser/prompt_runners/crec_summarizer.py @@ -0,0 +1,257 @@ +""" +Congressional Record debate summarizer. + +Generates LLM summaries at two levels: + 1. Granule-level: Summarizes each debate/segment individually + 2. Daily-level: Aggregates granule summaries into per-chamber daily overviews + +Follows the section_summarizer.py pattern with PromptBatch tracking. +""" + +import json +import logging +from datetime import datetime +from typing import List + +from congress_db.models import ( + CRECGranule, + CRECIssue, + CRECSpeech, + CRECSummary, + PromptBatch, +) +from congress_db.session import Session +from congress_parser.prompt_runners.utils import run_query + +logger = logging.getLogger(__name__) + +GRANULE_SUMMARY_PROMPT = """Summarize the following Congressional debate transcript. +Identify: +- The key topics discussed +- Which legislators spoke and their positions +- Any bills or legislation referenced +- Key arguments made for and against + +Keep the summary to 2-3 concise paragraphs. Output JSON with a "summary" key. + +TRANSCRIPT: +{transcript}""" + +DAILY_SUMMARY_PROMPT = """Summarize the following Congressional Record summaries for a single day. +These are summaries of individual debates and proceedings from the {chamber}. + +Provide a cohesive overview of the day's activities in 2-3 paragraphs. Output JSON with a "summary" key. 
def build_transcript(speeches: List["CRECSpeech"]) -> str:
    """Build a formatted transcript from speech segments.

    Each speech becomes one "SPEAKER: text" entry, with "UNKNOWN" standing in
    for a missing speaker; entries are separated by a blank line. Speeches
    without any text are left out so that a null ``content_text`` never shows
    up in the prompt as the literal string "None".

    Args:
        speeches: Speech rows, already in the order they should appear.

    Returns:
        The transcript text, or "" when there is nothing to show.
    """
    lines = [
        f"{speech.speaker_raw or 'UNKNOWN'}: {speech.content_text}"
        for speech in speeches
        if speech.content_text
    ]
    return "\n\n".join(lines)
def run_crec_summarizer(prompt_id: int = None, model: str = "ollama/gemma2:27b"):
    """Summarize all unsummarized Congressional Record granules and daily issues.

    Pass 1 writes one "granule" CRECSummary for every granule that has none.
    Pass 2 visits every issue that has no "daily" CRECSummary, groups its
    granule summaries by section and writes one "daily" summary per section.
    Both passes are tracked on a single PromptBatch.

    Args:
        prompt_id: Prompt row recorded on the PromptBatch (may be None).
        model: Model identifier forwarded to the summarizer helpers.
    """
    with Session() as session:
        # ---- Pass 1: granule-level summaries --------------------------------
        summarized_granule_ids = (
            session.query(CRECSummary.crec_granule_id)
            .filter(
                CRECSummary.crec_granule_id.isnot(None),
                CRECSummary.summary_type == "granule",
            )
            .subquery()
        )

        unsummarized_granules = (
            session.query(CRECGranule)
            .filter(CRECGranule.crec_granule_id.notin_(
                session.query(summarized_granule_ids)
            ))
            .order_by(CRECGranule.crec_granule_id.desc())
            .all()
        )

        logger.info(f"Found {len(unsummarized_granules)} unsummarized granules")

        prompt_batch = PromptBatch(
            prompt_id=prompt_id,
            legislation_version_id=None,
            attempted=0,
            successful=0,
            failed=0,
            skipped=0,
            created_at=datetime.now(),
        )
        session.add(prompt_batch)
        session.commit()

        for granule in unsummarized_granules:
            prompt_batch.attempted += 1

            summary_text = summarize_granule(granule.crec_granule_id, model=model)
            if summary_text:
                session.add(CRECSummary(
                    crec_granule_id=granule.crec_granule_id,
                    crec_issue_id=granule.crec_issue_id,
                    summary=summary_text,
                    summary_type="granule",
                    prompt_batch_id=prompt_batch.prompt_batch_id,
                ))
                prompt_batch.successful += 1
            else:
                # NOTE(review): the helper returns None both for "too short
                # to summarize" and for LLM errors, so `failed` is never
                # incremented; both outcomes are counted as skipped.
                prompt_batch.skipped += 1

            # Commit per granule so an interrupted run keeps its progress.
            session.commit()

        # ---- Pass 2: daily summaries ----------------------------------------
        summarized_issue_ids = (
            session.query(CRECSummary.crec_issue_id)
            .filter(
                CRECSummary.crec_issue_id.isnot(None),
                CRECSummary.summary_type == "daily",
            )
            .subquery()
        )

        issues_needing_daily = (
            session.query(CRECIssue)
            .filter(CRECIssue.crec_issue_id.notin_(
                session.query(summarized_issue_ids)
            ))
            .order_by(CRECIssue.issue_date.desc())
            .all()
        )

        for issue in issues_needing_daily:
            # One joined query returns each summary with its granule's
            # section, replacing a per-row granule lookup. Ordering by the
            # granule's position keeps the combined text in the day's order.
            rows = (
                session.query(CRECSummary.summary, CRECGranule.section)
                .join(CRECGranule, CRECSummary.crec_granule_id == CRECGranule.crec_granule_id)
                .filter(
                    CRECSummary.crec_issue_id == issue.crec_issue_id,
                    CRECSummary.summary_type == "granule",
                )
                .order_by(CRECGranule.order_number)
                .all()
            )

            granule_summaries_by_chamber = {}
            for granule_summary, section in rows:
                chamber = section.value if section else "Unknown"
                granule_summaries_by_chamber.setdefault(chamber, []).append(granule_summary)

            for chamber, summaries in granule_summaries_by_chamber.items():
                prompt_batch.attempted += 1
                daily_text = summarize_daily_chamber(
                    issue.crec_issue_id, chamber, summaries, model=model,
                )
                if daily_text:
                    # NOTE(review): the stored row does not record which
                    # chamber it covers, and an issue counts as done once ANY
                    # daily row exists - a chamber whose summary failed is
                    # not retried on the next run.
                    session.add(CRECSummary(
                        crec_issue_id=issue.crec_issue_id,
                        summary=daily_text,
                        summary_type="daily",
                        prompt_batch_id=prompt_batch.prompt_batch_id,
                    ))
                    prompt_batch.successful += 1
                else:
                    prompt_batch.skipped += 1

            session.commit()

        prompt_batch.completed_at = datetime.now()
        session.commit()
        logger.info(
            f"Summarization complete: {prompt_batch.successful} successful, "
            f"{prompt_batch.failed} failed, {prompt_batch.skipped} skipped"
        )
# Congressional bill reference patterns for detecting citations in debate text
# Matches: H.R. 1234, S. 567, H.J.Res. 12, S.Con.Res. 5, H.Res. 89, etc.
#
# The leading negative lookbehind stops the prefix from starting inside a word,
# after a period, or after an apostrophe. Without it "has 12", "years 2023",
# "U.S. 40" and "America's 50" are all read as Senate bills.
BILL_REF_REGEX = re.compile(
    r"(?<![\w.'])"
    r"(?P<prefix>"
    r"H\.?\s*R\.?"
    r"|S\.?\s*J\.?\s*Res\.?"
    r"|H\.?\s*J\.?\s*Res\.?"
    r"|S\.?\s*Con\.?\s*Res\.?"
    r"|H\.?\s*Con\.?\s*Res\.?"
    r"|H\.?\s*Res\.?"
    r"|S\.?\s*Res\.?"
    r"|S\.?"
    r")\s*(?P<number>\d+)",
    re.IGNORECASE,
)

# Map from normalized citation prefixes to (chamber, legislation_type) labels.
# NOTE(review): "Con. Res." is a concurrent resolution; the "Continuing
# Resolution" label is kept as-is because consumers use these labels as
# lookup keys.
BILL_PREFIX_MAP = {
    "hr": ("House", "Bill"),
    "s": ("Senate", "Bill"),
    "hjres": ("House", "Joint Resolution"),
    "sjres": ("Senate", "Joint Resolution"),
    "hconres": ("House", "Continuing Resolution"),
    "sconres": ("Senate", "Continuing Resolution"),
    "hres": ("House", "Resolution"),
    "sres": ("Senate", "Resolution"),
}


class BillRefObject(TypedDict):
    # One bill citation found in free text.
    cite_text: str  # the citation exactly as it appears in the text
    chamber: str  # "House" or "Senate"
    legislation_type: str  # label from BILL_PREFIX_MAP
    number: int  # bill / resolution number
    cite_type: str  # always "bill" for these references
    start: int  # offset of the first character of cite_text
    end: int  # offset one past the last character of cite_text


def _normalize_prefix(prefix: str) -> str:
    """Normalize a bill prefix like 'H. R.' or 'H.J. Res.' to a lookup key."""
    return re.sub(r"[\s.]", "", prefix).lower()


def extract_bill_references(text: str) -> List[BillRefObject]:
    """Extract Congressional bill references from text (e.g. debate transcripts).

    Args:
        text: Free text to scan.

    Returns:
        One BillRefObject per citation, in order of appearance, with
        cite_text, chamber, legislation_type, number, start/end character
        offsets into `text`, and cite_type="bill". Repeated citations are
        reported each time they occur.
    """
    results: List[BillRefObject] = []
    for match in BILL_REF_REGEX.finditer(text):
        prefix = _normalize_prefix(match.group("prefix"))
        number = int(match.group("number"))

        mapping = BILL_PREFIX_MAP.get(prefix)
        if mapping is None:
            continue

        chamber, leg_type = mapping
        results.append({
            "cite_text": match.group(0),
            "chamber": chamber,
            "legislation_type": leg_type,
            "number": number,
            "cite_type": "bill",
            "start": match.start(),
            "end": match.end(),
        })
    return results
-742,6 +746,100 @@ model verification_token { @@schema("authentication") } +model crec_issue { + crec_issue_id Int @id @default(autoincrement()) + issue_date DateTime? @unique(map: "ix_crec_issue_issue_date") @db.Date + congress_id Int? + package_id String? @unique(map: "uq_crec_issue_package_id") @db.VarChar + created_at DateTime? @default(now()) @db.Timestamp(6) + congress congress? @relation(fields: [congress_id], references: [congress_id], onDelete: Cascade, onUpdate: NoAction) + crec_granule crec_granule[] + crec_summary crec_summary[] + + @@index([congress_id], map: "ix_crec_issue_congress_id") + @@schema("public") +} + +model crec_granule { + crec_granule_id Int @id @default(autoincrement()) + crec_issue_id Int? + granule_id String? @unique(map: "uq_crec_granule_granule_id") @db.VarChar + section crecsection? + title String? @db.VarChar + page_start String? @db.VarChar + page_end String? @db.VarChar + order_number Int? + created_at DateTime? @default(now()) @db.Timestamp(6) + crec_issue crec_issue? @relation(fields: [crec_issue_id], references: [crec_issue_id], onDelete: Cascade, onUpdate: NoAction) + crec_speech crec_speech[] + crec_summary crec_summary[] + + @@index([crec_issue_id], map: "ix_crec_granule_crec_issue_id") + @@index([section], map: "ix_crec_granule_section") + @@schema("public") +} + +model crec_speech { + crec_speech_id Int @id @default(autoincrement()) + crec_granule_id Int? + speaker_raw String? @db.VarChar + legislator_bioguide_id String? @db.VarChar + order_number Int? + content_text String? @db.VarChar + word_count Int? + created_at DateTime? @default(now()) @db.Timestamp(6) + crec_granule crec_granule? @relation(fields: [crec_granule_id], references: [crec_granule_id], onDelete: Cascade, onUpdate: NoAction) + legislator legislator? 
@relation(fields: [legislator_bioguide_id], references: [bioguide_id], onDelete: SetNull, onUpdate: NoAction) + crec_bill_reference crec_bill_reference[] + + @@index([crec_granule_id], map: "ix_crec_speech_crec_granule_id") + @@index([legislator_bioguide_id], map: "ix_crec_speech_legislator_bioguide_id") + @@schema("public") +} + +model crec_bill_reference { + crec_bill_reference_id Int @id @default(autoincrement()) + crec_speech_id Int? + legislation_id Int? + cite_text String? @db.VarChar + cite_type String? @db.VarChar + start_offset Int? + end_offset Int? + crec_speech crec_speech? @relation(fields: [crec_speech_id], references: [crec_speech_id], onDelete: Cascade, onUpdate: NoAction) + legislation legislation? @relation(fields: [legislation_id], references: [legislation_id], onDelete: SetNull, onUpdate: NoAction) + + @@index([crec_speech_id], map: "ix_crec_bill_reference_crec_speech_id") + @@index([legislation_id], map: "ix_crec_bill_reference_legislation_id") + @@schema("public") +} + +model crec_summary { + crec_summary_id Int @id @default(autoincrement()) + crec_granule_id Int? + crec_issue_id Int? + summary String? @db.VarChar + summary_type String? @db.VarChar + prompt_batch_id Int? + created_at DateTime? @default(now()) @db.Timestamp(6) + crec_granule crec_granule? @relation(fields: [crec_granule_id], references: [crec_granule_id], onDelete: Cascade, onUpdate: NoAction) + crec_issue crec_issue? @relation(fields: [crec_issue_id], references: [crec_issue_id], onDelete: Cascade, onUpdate: NoAction) + prompt_batch prompt_batch? 
@relation(fields: [prompt_batch_id], references: [prompt_batch_id], onDelete: Cascade, onUpdate: NoAction) + + @@index([crec_granule_id], map: "ix_crec_summary_crec_granule_id") + @@index([crec_issue_id], map: "ix_crec_summary_crec_issue_id") + @@index([prompt_batch_id], map: "ix_crec_summary_prompt_batch_id") + @@schema("prompts") +} + +enum crecsection { + Senate + House + Extensions + DailyDigest + + @@schema("public") +} + enum legislationchamber { House Senate diff --git a/hillstack/src/app/congress/bills/[billId]/debates/page.tsx b/hillstack/src/app/congress/bills/[billId]/debates/page.tsx new file mode 100644 index 00000000..045c3d68 --- /dev/null +++ b/hillstack/src/app/congress/bills/[billId]/debates/page.tsx @@ -0,0 +1,162 @@ +'use client'; + +import Avatar from '@mui/material/Avatar'; +import Box from '@mui/material/Box'; +import Chip from '@mui/material/Chip'; +import Paper from '@mui/material/Paper'; +import Skeleton from '@mui/material/Skeleton'; +import Typography from '@mui/material/Typography'; +import Link from 'next/link'; +import { useParams } from 'next/navigation'; +import { api } from '~/trpc/react'; + +export default function BillDebatesPage() { + const params = useParams(); + const billId = Number(params.billId); + + const { data, isLoading } = api.congressionalRecord.debatesForBill.useQuery({ + legislationId: billId, + }); + + if (isLoading) { + return ( + + {Array.from({ length: 5 }).map((_, i) => ( + + ))} + + ); + } + + if (!data || data.length === 0) { + return ( + + + No Congressional Record debates found referencing this bill. + + + ); + } + + // Group by granule for better display + const byGranule = new Map(); + + for (const ref of data) { + const speech = ref.crec_speech; + if (!speech?.crec_granule) continue; + const gId = speech.crec_granule.crec_granule_id; + if (!byGranule.has(gId)) { + byGranule.set(gId, { + granuleId: gId, + issueId: speech.crec_granule.crec_issue_id ?? 0, + title: speech.crec_granule.title ?? 
'Untitled Debate', + section: speech.crec_granule.section, + issueDate: speech.crec_granule.crec_issue?.issue_date ?? null, + speeches: [], + }); + } + byGranule.get(gId)?.speeches.push(ref); + } + + return ( + + + {data.length} mention{data.length !== 1 ? 's' : ''} found in the Congressional Record + + + {Array.from(byGranule.values()).map((group) => { + const dateStr = group.issueDate + ? new Date(group.issueDate).toLocaleDateString('en-US', { + year: 'numeric', + month: 'long', + day: 'numeric', + }) + : ''; + + return ( + + + + + {group.title} + + + {group.section && ( + + )} + {dateStr && ( + + {dateStr} + + )} + + + {group.speeches.slice(0, 3).map((ref) => { + const speech = ref.crec_speech; + if (!speech) return null; + const leg = speech.legislator; + const name = leg + ? `${leg.first_name ?? ''} ${leg.last_name ?? ''}`.trim() + : speech.speaker_raw ?? 'Unknown'; + + return ( + + + {name[0] ?? '?'} + + + + {name} + {leg?.party ? ` (${leg.party})` : ''} + + + {speech.content_text} + + + + ); + })} + + {group.speeches.length > 3 && ( + + + View all {group.speeches.length} mentions in this debate + + + )} + + ); + })} + + ); +} diff --git a/hillstack/src/app/congress/legislators/[bioguideId]/page.tsx b/hillstack/src/app/congress/legislators/[bioguideId]/page.tsx index 20e74a99..c2abc903 100644 --- a/hillstack/src/app/congress/legislators/[bioguideId]/page.tsx +++ b/hillstack/src/app/congress/legislators/[bioguideId]/page.tsx @@ -19,6 +19,7 @@ import { import type { Params } from 'next/dist/server/request/params'; import Link from 'next/link'; import { LegislatorFollow } from '~/app/congress/legislators/[bioguideId]/follow'; +import { LegislatorSpeakingStats } from '~/components/record/LegislatorSpeakingStats'; import { stateAbbreviations } from '~/constants'; import { api, HydrateClient } from '~/trpc/server'; @@ -363,6 +364,7 @@ export default async function LegislatorPage({ ))} + diff --git a/hillstack/src/app/congress/record/[issueId]/[granuleId]/page.tsx 
b/hillstack/src/app/congress/record/[issueId]/[granuleId]/page.tsx new file mode 100644 index 00000000..a39950ce --- /dev/null +++ b/hillstack/src/app/congress/record/[issueId]/[granuleId]/page.tsx @@ -0,0 +1,86 @@ +import Box from '@mui/material/Box'; +import Breadcrumbs from '@mui/material/Breadcrumbs'; +import Chip from '@mui/material/Chip'; +import Paper from '@mui/material/Paper'; +import Typography from '@mui/material/Typography'; +import Link from 'next/link'; +import { api } from '~/trpc/server'; +import { HydrateClient } from '~/trpc/server'; +import { SpeechCard } from '~/components/record/SpeechCard'; + +export default async function GranulePage({ + params, +}: { + params: Promise<{ issueId: string; granuleId: string }>; +}) { + const { issueId, granuleId } = await params; + const granule = await api.congressionalRecord.getGranule({ + granuleId: Number(granuleId), + }); + + const summary = granule.crec_summary[0]?.summary; + + return ( + + + + Record + + Daily Issue + + Debate + + + + {granule.title || 'Congressional Record Entry'} + + + + {granule.section && ( + + )} + {granule.page_start && ( + + )} + + + {summary && ( + + + AI Summary + + {summary} + + )} + + + Transcript + + + {granule.crec_speech.map((speech) => ( + + ))} + + {granule.crec_speech.length === 0 && ( + + No speech segments found for this entry. 
+ + )} + + + ); +} diff --git a/hillstack/src/app/congress/record/[issueId]/page.tsx b/hillstack/src/app/congress/record/[issueId]/page.tsx new file mode 100644 index 00000000..325c7211 --- /dev/null +++ b/hillstack/src/app/congress/record/[issueId]/page.tsx @@ -0,0 +1,143 @@ +import Box from '@mui/material/Box'; +import Chip from '@mui/material/Chip'; +import Divider from '@mui/material/Divider'; +import Grid from '@mui/material/Grid'; +import Paper from '@mui/material/Paper'; +import Typography from '@mui/material/Typography'; +import Link from 'next/link'; +import { api } from '~/trpc/server'; +import { HydrateClient } from '~/trpc/server'; + +export default async function IssuePage({ + params, +}: { + params: Promise<{ issueId: string }>; +}) { + const { issueId } = await params; + const issue = await api.congressionalRecord.getIssue({ + issueId: Number(issueId), + }); + + const dateStr = issue.issue_date + ? new Date(issue.issue_date).toLocaleDateString('en-US', { + weekday: 'long', + year: 'numeric', + month: 'long', + day: 'numeric', + }) + : 'Unknown Date'; + + const sections = ['Senate', 'House', 'Extensions', 'DailyDigest'] as const; + const granulesBySection: Record = {}; + for (const g of issue.crec_granule) { + const s = g.section ?? 'Other'; + if (!granulesBySection[s]) granulesBySection[s] = []; + granulesBySection[s].push(g); + } + + return ( + + + + Congressional Record + + + {dateStr} + + + {issue.crec_summary.length > 0 && ( + + + AI Summary + + {issue.crec_summary.map((s, i) => ( + + {s.summary} + + ))} + + )} + + + {sections.map((section) => { + const granules = granulesBySection[section]; + if (!granules || granules.length === 0) return null; + + return ( + + + + {section === 'DailyDigest' + ? 'Daily Digest' + : section === 'Extensions' + ? 
'Extensions of Remarks' + : section} + + + {granules.map((g) => ( + + + {g.title || 'Untitled'} + + + {g.page_start && ( + + )} + + + {g.crec_summary[0]?.summary && ( + + {g.crec_summary[0].summary} + + )} + + ))} + + + ); + })} + + + + ); +} diff --git a/hillstack/src/app/congress/record/page.tsx b/hillstack/src/app/congress/record/page.tsx new file mode 100644 index 00000000..651c8e4b --- /dev/null +++ b/hillstack/src/app/congress/record/page.tsx @@ -0,0 +1,99 @@ +'use client'; + +import Box from '@mui/material/Box'; +import Chip from '@mui/material/Chip'; +import List from '@mui/material/List'; +import ListItem from '@mui/material/ListItem'; +import ListItemButton from '@mui/material/ListItemButton'; +import ListItemText from '@mui/material/ListItemText'; +import Paper from '@mui/material/Paper'; +import Skeleton from '@mui/material/Skeleton'; +import Typography from '@mui/material/Typography'; +import Link from 'next/link'; +import { api } from '~/trpc/react'; + +export default function CongressionalRecordPage() { + const { data, isLoading } = api.congressionalRecord.list.useQuery({ + page: 1, + pageSize: 30, + }); + + return ( + + + Congressional Record + + + Browse daily transcripts of Congressional debates, speeches, and proceedings. + + + + {isLoading ? ( + + {Array.from({ length: 10 }).map((_, i) => ( + + ))} + + ) : ( + + {data?.issues?.map((issue) => { + const dateStr = issue.issue_date + ? new Date(issue.issue_date).toLocaleDateString('en-US', { + weekday: 'long', + year: 'numeric', + month: 'long', + day: 'numeric', + }) + : 'Unknown Date'; + + const sectionCounts: Record = {}; + for (const g of issue.crec_granule) { + const s = g.section ?? 'Other'; + sectionCounts[s] = (sectionCounts[s] ?? 
0) + 1; + } + + return ( + + + + {Object.entries(sectionCounts).map( + ([section, count]) => ( + + ), + )} + + } + /> + + + ); + })} + {data?.issues?.length === 0 && ( + + + + )} + + )} + + + ); +} diff --git a/hillstack/src/app/congress/record/stats/page.tsx b/hillstack/src/app/congress/record/stats/page.tsx new file mode 100644 index 00000000..f62d4d53 --- /dev/null +++ b/hillstack/src/app/congress/record/stats/page.tsx @@ -0,0 +1,139 @@ +'use client'; + +import Avatar from '@mui/material/Avatar'; +import Box from '@mui/material/Box'; +import Chip from '@mui/material/Chip'; +import Paper from '@mui/material/Paper'; +import Skeleton from '@mui/material/Skeleton'; +import Table from '@mui/material/Table'; +import TableBody from '@mui/material/TableBody'; +import TableCell from '@mui/material/TableCell'; +import TableContainer from '@mui/material/TableContainer'; +import TableHead from '@mui/material/TableHead'; +import TableRow from '@mui/material/TableRow'; +import Typography from '@mui/material/Typography'; +import Link from 'next/link'; +import { api } from '~/trpc/react'; + +function getPartyColor(party: string | null | undefined): string { + if (!party) return '#9e9e9e'; + const p = party.toLowerCase(); + if (p.startsWith('r')) return '#e53935'; + if (p.startsWith('d')) return '#1e88e5'; + return '#9e9e9e'; +} + +export default function SpeakerStatsPage() { + const { data, isLoading } = api.congressionalRecord.speakerStats.useQuery({ + limit: 50, + page: 1, + }); + + return ( + + + Speaker Statistics + + + Top Congressional speakers by total word count in the Congressional Record. + + + + + + + Rank + Legislator + Party + State + Total Words + Speeches + Avg Words/Speech + + + + {isLoading + ? Array.from({ length: 20 }).map((_, i) => ( + + + + + + + + + + )) + : data?.speakers?.map((speaker, idx) => { + const avg = speaker.speech_count > 0 + ? Math.round(speaker.total_words / speaker.speech_count) + : 0; + + return ( + + {idx + 1} + + + {speaker.image_url ? 
( + + ) : ( + + {(speaker.first_name?.[0] ?? '') + + (speaker.last_name?.[0] ?? '')} + + )} + + {speaker.first_name} {speaker.last_name} + + + + + + + {speaker.state ?? '-'} + + {speaker.total_words.toLocaleString()} + + + {speaker.speech_count.toLocaleString()} + + + {avg.toLocaleString()} + + + ); + })} + +
+
+ + {!isLoading && data?.speakers?.length === 0 && ( + + No speaker data available yet. Run the Congressional Record importer first. + + )} +
+ ); +} diff --git a/hillstack/src/components/record/LegislatorSpeakingStats.tsx b/hillstack/src/components/record/LegislatorSpeakingStats.tsx new file mode 100644 index 00000000..7bcf2dd6 --- /dev/null +++ b/hillstack/src/components/record/LegislatorSpeakingStats.tsx @@ -0,0 +1,155 @@ +'use client'; + +import ArticleOutlinedIcon from '@mui/icons-material/ArticleOutlined'; +import Box from '@mui/material/Box'; +import Card from '@mui/material/Card'; +import Chip from '@mui/material/Chip'; +import List from '@mui/material/List'; +import ListItem from '@mui/material/ListItem'; +import ListItemButton from '@mui/material/ListItemButton'; +import Toolbar from '@mui/material/Toolbar'; +import Typography from '@mui/material/Typography'; +import Link from 'next/link'; +import { api } from '~/trpc/react'; + +export function LegislatorSpeakingStats({ + bioguideId, +}: { + bioguideId: string; +}) { + const { data, isLoading } = api.congressionalRecord.legislatorStats.useQuery({ + bioguideId, + }); + + if (isLoading || !data || data.speechCount === 0) { + return null; + } + + return ( + + + + Congressional Record Activity + + + + + + {data.totalWords.toLocaleString()} + + + Total Words + + + + + {data.speechCount.toLocaleString()} + + + Speeches + + + {data.speechCount > 0 && ( + + + {Math.round(data.totalWords / data.speechCount).toLocaleString()} + + + Avg Words/Speech + + + )} + + + + {data.recentSpeeches.length > 0 && ( + <> + + + Recent Speeches + + + + {data.recentSpeeches.map((speech) => { + const granule = speech.crec_granule; + const issueDate = granule?.crec_issue?.issue_date; + const dateStr = issueDate + ? new Date(issueDate).toLocaleDateString('en-US', { + month: 'short', + day: 'numeric', + year: 'numeric', + }) + : ''; + + return ( + + + + + + {granule?.title ?? 
/** Grey used when a speaker has no party, or a party that is neither R nor D. */
const NEUTRAL_PARTY_COLOR = '#9e9e9e';

/**
 * Pick the accent colour for a speaker's party.
 *
 * Matching is case-insensitive and looks only at the first letter of the
 * party name: "r…" is red, "d…" is blue, anything else is neutral grey.
 *
 * A missing party previously returned the literal string `'default'`, which
 * is not a colour value like the other branches return. It now returns the
 * same neutral grey as the "other party" branch, so every return value is a
 * hex colour.
 *
 * NOTE(review): the JSX that consumes `partyColor` is not visible in this
 * extract — confirm nothing compares the result against `'default'`.
 */
function getPartyColor(party: string | null | undefined): string {
	if (!party) return NEUTRAL_PARTY_COLOR;

	const normalized = party.toLowerCase();
	if (normalized.startsWith('r')) return '#e53935';
	if (normalized.startsWith('d')) return '#1e88e5';
	return NEUTRAL_PARTY_COLOR;
}
( + + ) : ( + + )} + + ))} + + )} + + ); +} diff --git a/hillstack/src/constants/navigation.tsx b/hillstack/src/constants/navigation.tsx index e38664ee..83ef9387 100644 --- a/hillstack/src/constants/navigation.tsx +++ b/hillstack/src/constants/navigation.tsx @@ -1,4 +1,5 @@ import AccountBalanceOutlinedIcon from '@mui/icons-material/AccountBalanceOutlined'; +import ArticleOutlinedIcon from '@mui/icons-material/ArticleOutlined'; import AutoGraphIcon from '@mui/icons-material/AutoGraph'; import ChevronLeftOutlinedIcon from '@mui/icons-material/ChevronLeftOutlined'; import Diversity2Icon from '@mui/icons-material/Diversity2'; @@ -48,9 +49,15 @@ export const congressTabs: NavigationTabs = { icon: , label: 'Insights', }, + '/congress/record': { + id: 6, + icon: , + label: 'Record', + }, }; import DifferenceIcon from '@mui/icons-material/Difference'; +import ForumIcon from '@mui/icons-material/Forum'; import LocalAtmIcon from '@mui/icons-material/LocalAtm'; import ManageSearchIcon from '@mui/icons-material/ManageSearch'; import SmartButtonIcon from '@mui/icons-material/SmartButton'; @@ -82,5 +89,10 @@ export const congressBillTabs = ({ icon: , label: 'Spending', }, + [`/congress/bills/${params.billId}/debates`]: { + id: 4, + icon: , + label: 'Debates', + }, }; }; diff --git a/hillstack/src/server/api/root.ts b/hillstack/src/server/api/root.ts index c2ba5ca0..af51a3fd 100644 --- a/hillstack/src/server/api/root.ts +++ b/hillstack/src/server/api/root.ts @@ -1,4 +1,5 @@ import { billRouter } from '~/server/api/routers/bill'; +import { congressionalRecordRouter } from '~/server/api/routers/congressionalRecord'; import { statsRouter } from '~/server/api/routers/stats'; import { userRouter } from '~/server/api/routers/user'; import { createCallerFactory, createTRPCRouter } from '~/server/api/trpc'; @@ -12,6 +13,7 @@ import { legislatorRouter } from './routers/legislator'; */ export const appRouter = createTRPCRouter({ bill: billRouter, + congressionalRecord: 
import { z } from 'zod';
import { createTRPCRouter, publicProcedure } from '~/server/api/trpc';

/**
 * Largest page size a client may request. These procedures are public, so an
 * unbounded `take` / `LIMIT` would let any caller pull whole tables.
 */
const MAX_PAGE_SIZE = 100;

/** 1-based page number; rejects 0, negatives and fractions up front. */
const pageNumber = z.number().int().min(1).default(1);

/**
 * Optional date filter. An empty string is still treated as "no filter"
 * (the original truthiness checks did the same); anything else must be
 * parseable so a bad value fails input validation instead of surfacing as a
 * database error.
 */
const optionalDate = z
	.string()
	.refine((value) => value === '' || !Number.isNaN(Date.parse(value)), {
		message: 'Invalid date',
	})
	.optional();

type IssueDateRange = { gte?: Date; lte?: Date };

/** Build the `crec_issue` where-clause shared by `list` and `activityCalendar`. */
function buildIssueWhere(
	startDate?: string,
	endDate?: string,
): { issue_date?: IssueDateRange } {
	if (!startDate && !endDate) return {};

	const range: IssueDateRange = {};
	if (startDate) range.gte = new Date(startDate);
	if (endDate) range.lte = new Date(endDate);
	return { issue_date: range };
}

/** One row of the speaker leaderboard as returned by the raw SQL below. */
type SpeakerStatsRow = {
	bioguide_id: string;
	first_name: string | null;
	last_name: string | null;
	party: string | null;
	state: string | null;
	image_url: string | null;
	total_words: bigint;
	speech_count: bigint;
};

/** Aggregate speaking totals for a single legislator. */
type LegislatorTotalsRow = {
	// SUM over zero rows (or all-NULL word counts) is NULL.
	total_words: bigint | null;
	speech_count: bigint;
	first_date: Date | null;
	last_date: Date | null;
};

/**
 * tRPC procedures backing the Congressional Record pages: issue browsing,
 * granule transcripts, speaker statistics and per-bill / per-legislator
 * lookups.
 */
export const congressionalRecordRouter = createTRPCRouter({
	/** Paginated list of daily issues, newest first, with per-granule sections. */
	list: publicProcedure
		.input(
			z.object({
				page: pageNumber,
				pageSize: z.number().int().min(1).max(MAX_PAGE_SIZE).default(20),
				startDate: optionalDate,
				endDate: optionalDate,
			}),
		)
		.query(async ({ input, ctx }) => {
			const { page, pageSize, startDate, endDate } = input;
			const where = buildIssueWhere(startDate, endDate);

			const [issues, totalResults] = await Promise.all([
				ctx.db.crec_issue.findMany({
					select: {
						crec_issue_id: true,
						issue_date: true,
						congress_id: true,
						package_id: true,
						crec_granule: {
							select: {
								crec_granule_id: true,
								section: true,
							},
						},
					},
					where,
					orderBy: { issue_date: 'desc' },
					skip: (page - 1) * pageSize,
					take: pageSize,
				}),
				ctx.db.crec_issue.count({ where }),
			]);

			return { issues, totalResults };
		}),

	/** A single issue with its granules (in document order) and daily summary. */
	getIssue: publicProcedure
		.input(z.object({ issueId: z.number().int() }))
		.query(async ({ input, ctx }) => {
			const issue = await ctx.db.crec_issue.findUniqueOrThrow({
				select: {
					crec_issue_id: true,
					issue_date: true,
					congress_id: true,
					package_id: true,
					crec_granule: {
						select: {
							crec_granule_id: true,
							granule_id: true,
							section: true,
							title: true,
							page_start: true,
							page_end: true,
							order_number: true,
							crec_summary: {
								select: {
									summary: true,
								},
								where: {
									summary_type: 'granule',
								},
								take: 1,
							},
							_count: {
								select: {
									crec_speech: true,
								},
							},
						},
						orderBy: { order_number: 'asc' },
					},
					crec_summary: {
						select: {
							summary: true,
							summary_type: true,
						},
						where: {
							summary_type: 'daily',
						},
					},
				},
				where: { crec_issue_id: input.issueId },
			});

			return issue;
		}),

	/** A single granule with its speeches, resolved speakers and bill references. */
	getGranule: publicProcedure
		.input(z.object({ granuleId: z.number().int() }))
		.query(async ({ input, ctx }) => {
			const granule = await ctx.db.crec_granule.findUniqueOrThrow({
				select: {
					crec_granule_id: true,
					crec_issue_id: true,
					granule_id: true,
					section: true,
					title: true,
					page_start: true,
					page_end: true,
					crec_speech: {
						select: {
							crec_speech_id: true,
							speaker_raw: true,
							legislator_bioguide_id: true,
							order_number: true,
							content_text: true,
							word_count: true,
							legislator: {
								select: {
									first_name: true,
									last_name: true,
									party: true,
									state: true,
									image_url: true,
								},
							},
							crec_bill_reference: {
								select: {
									crec_bill_reference_id: true,
									legislation_id: true,
									cite_text: true,
									cite_type: true,
									start_offset: true,
									end_offset: true,
									legislation: {
										select: {
											legislation_id: true,
											title: true,
											number: true,
											chamber: true,
										},
									},
								},
							},
						},
						orderBy: { order_number: 'asc' },
					},
					crec_summary: {
						select: {
							summary: true,
						},
						where: {
							summary_type: 'granule',
						},
						take: 1,
					},
				},
				where: { crec_granule_id: input.granuleId },
			});

			return granule;
		}),

	/**
	 * Leaderboard of legislators by total words spoken, optionally restricted
	 * by issue date range and chamber (granule section).
	 */
	speakerStats: publicProcedure
		.input(
			z.object({
				startDate: optionalDate,
				endDate: optionalDate,
				chamber: z.string().optional(),
				limit: z.number().int().min(1).max(MAX_PAGE_SIZE).default(20),
				page: pageNumber,
			}),
		)
		.query(async ({ input, ctx }) => {
			const { startDate, endDate, chamber, limit, page } = input;

			// Raw SQL is used for the SUM/COUNT aggregation. Every user value
			// is passed as a bound parameter; the only text interpolated into
			// the statement is the fixed fragments and "$n" placeholders
			// produced here.
			const filterParams: unknown[] = [];
			const bind = (value: unknown): string => {
				filterParams.push(value);
				return `$${filterParams.length}`;
			};

			const conditions: string[] = [];
			if (startDate) {
				conditions.push(`ci.issue_date >= ${bind(startDate)}::date`);
			}
			if (endDate) {
				conditions.push(`ci.issue_date <= ${bind(endDate)}::date`);
			}
			if (chamber) {
				// `section` is the `crecsection` enum. PostgreSQL has no
				// `enum = text` operator, so the column is cast to text rather
				// than casting the parameter (which made the query fail).
				conditions.push(`cg.section::text = ${bind(chamber)}`);
			}

			const filterSql = conditions.map((c) => ` AND ${c}`).join('');

			// Shared by the data and count queries so both see the same rows.
			const fromSql = `
				FROM crec_speech cs
				JOIN crec_granule cg ON cs.crec_granule_id = cg.crec_granule_id
				JOIN crec_issue ci ON cg.crec_issue_id = ci.crec_issue_id
				JOIN legislator l ON cs.legislator_bioguide_id = l.bioguide_id
				WHERE cs.legislator_bioguide_id IS NOT NULL${filterSql}`;

			const limitIdx = filterParams.length + 1;
			const offset = (page - 1) * limit;

			const [rows, countRows] = await Promise.all([
				ctx.db.$queryRawUnsafe<SpeakerStatsRow[]>(
					`SELECT
						cs.legislator_bioguide_id AS bioguide_id,
						l.first_name,
						l.last_name,
						l.party,
						l.state,
						l.image_url,
						COALESCE(SUM(cs.word_count), 0)::bigint AS total_words,
						COUNT(cs.crec_speech_id)::bigint AS speech_count
					${fromSql}
					GROUP BY cs.legislator_bioguide_id, l.first_name, l.last_name, l.party, l.state, l.image_url
					ORDER BY total_words DESC, cs.legislator_bioguide_id ASC
					LIMIT $${limitIdx} OFFSET $${limitIdx + 1}`,
					...filterParams,
					limit,
					offset,
				),
				ctx.db.$queryRawUnsafe<Array<{ count: bigint }>>(
					`SELECT COUNT(DISTINCT cs.legislator_bioguide_id)::bigint AS count
					${fromSql}`,
					...filterParams,
				),
			]);

			return {
				speakers: rows.map((row) => ({
					...row,
					// bigint is not JSON-serialisable; these totals fit a number.
					total_words: Number(row.total_words),
					speech_count: Number(row.speech_count),
				})),
				totalResults: Number(countRows[0]?.count ?? 0),
			};
		}),

	/** Number of granules per issue date, oldest first. */
	activityCalendar: publicProcedure
		.input(
			z
				.object({
					startDate: optionalDate,
					endDate: optionalDate,
				})
				.optional(),
		)
		.query(async ({ input, ctx }) => {
			const where = buildIssueWhere(input?.startDate, input?.endDate);

			const issues = await ctx.db.crec_issue.findMany({
				select: {
					issue_date: true,
					_count: {
						select: {
							crec_granule: true,
						},
					},
				},
				where,
				orderBy: { issue_date: 'asc' },
			});

			return issues.map((issue) => ({
				date: issue.issue_date,
				count: issue._count.crec_granule,
			}));
		}),

	// For bill detail page - get debates referencing a specific bill
	debatesForBill: publicProcedure
		.input(z.object({ legislationId: z.number().int() }))
		.query(async ({ input, ctx }) => {
			const references = await ctx.db.crec_bill_reference.findMany({
				select: {
					cite_text: true,
					crec_speech: {
						select: {
							crec_speech_id: true,
							speaker_raw: true,
							legislator_bioguide_id: true,
							content_text: true,
							word_count: true,
							legislator: {
								select: {
									first_name: true,
									last_name: true,
									party: true,
								},
							},
							crec_granule: {
								select: {
									crec_granule_id: true,
									crec_issue_id: true,
									title: true,
									section: true,
									crec_issue: {
										select: {
											issue_date: true,
										},
									},
								},
							},
						},
					},
				},
				where: {
					legislation_id: input.legislationId,
				},
				// Without an explicit order, which 50 rows come back is up to
				// the database; newest references first keeps it deterministic.
				orderBy: { crec_bill_reference_id: 'desc' },
				take: 50,
			});

			return references;
		}),

	// For legislator page - get speaking stats for a specific legislator
	legislatorStats: publicProcedure
		.input(z.object({ bioguideId: z.string() }))
		.query(async ({ input, ctx }) => {
			const [totals, recentSpeeches] = await Promise.all([
				// Static statement, so the tagged-template form is used: the
				// interpolated value is sent as a bound parameter.
				ctx.db.$queryRaw<LegislatorTotalsRow[]>`
					SELECT
						SUM(cs.word_count)::bigint AS total_words,
						COUNT(cs.crec_speech_id)::bigint AS speech_count,
						MIN(ci.issue_date) AS first_date,
						MAX(ci.issue_date) AS last_date
					FROM crec_speech cs
					JOIN crec_granule cg ON cs.crec_granule_id = cg.crec_granule_id
					JOIN crec_issue ci ON cg.crec_issue_id = ci.crec_issue_id
					WHERE cs.legislator_bioguide_id = ${input.bioguideId}`,
				ctx.db.crec_speech.findMany({
					select: {
						crec_speech_id: true,
						content_text: true,
						word_count: true,
						crec_granule: {
							select: {
								crec_granule_id: true,
								crec_issue_id: true,
								title: true,
								section: true,
								crec_issue: {
									select: {
										issue_date: true,
									},
								},
							},
						},
					},
					where: {
						legislator_bioguide_id: input.bioguideId,
					},
					// NOTE: "recent" here means most recently inserted, not
					// latest issue date.
					orderBy: {
						crec_speech_id: 'desc',
					},
					take: 10,
				}),
			]);

			const stat = totals[0];
			return {
				totalWords: Number(stat?.total_words ?? 0),
				speechCount: Number(stat?.speech_count ?? 0),
				firstDate: stat?.first_date ?? null,
				lastDate: stat?.last_date ?? null,
				recentSpeeches,
			};
		}),
});
op.create_table( 'crec_issue', @@ -38,7 +41,7 @@ def upgrade() -> None: sa.Column('crec_granule_id', sa.Integer(), primary_key=True, autoincrement=True), sa.Column('crec_issue_id', sa.Integer(), sa.ForeignKey('crec_issue.crec_issue_id', ondelete='CASCADE'), nullable=True), sa.Column('granule_id', sa.String(), nullable=True), - sa.Column('section', crec_section, nullable=True), + sa.Column('section', crec_section_ref, nullable=True), sa.Column('title', sa.String(), nullable=True), sa.Column('page_start', sa.String(), nullable=True), sa.Column('page_end', sa.String(), nullable=True), diff --git a/backend/congress_parser/importers/congressional_record.py b/backend/congress_parser/importers/congressional_record.py index 4ceeb8a2..c0061b86 100644 --- a/backend/congress_parser/importers/congressional_record.py +++ b/backend/congress_parser/importers/congressional_record.py @@ -1,17 +1,15 @@ """ Congressional Record importer. -Downloads and parses daily Congressional Record bulk data from govinfo.gov. -Each daily issue is a ZIP containing HTML/XML files organized by section -(Senate, House, Extensions of Remarks, Daily Digest). +Downloads and parses daily Congressional Record PDFs from govinfo.gov. -Bulk data URL pattern: - https://www.govinfo.gov/bulkdata/CREC/{year}/{month:02d}/{day:02d}/CREC-{year}-{month:02d}-{day:02d}.zip +PDF URL pattern: + https://www.govinfo.gov/content/pkg/CREC-{year}-{month:02d}-{day:02d}/pdf/CREC-{year}-{month:02d}-{day:02d}.pdf The importer: 1. Iterates through dates in the current Congress (119th, starting Jan 2025) - 2. Downloads daily ZIP packages - 3. Parses HTML/XML granules to extract debate segments + 2. Downloads daily PDF + 3. Extracts text and splits by section (Senate, House, Extensions, Daily Digest) 4. Identifies speakers and resolves them to legislator bioguide IDs 5. Extracts bill references from speech text 6. 
# One alphabetic word, optionally followed by an apostrophe suffix ("WOMEN'S").
# Capitalizing per match keeps the suffix lowercase, which str.title() does not.
_HEADING_WORD_RE = re.compile(r"[A-Za-z]+('[A-Za-z]+)?")


def _titlecase_heading(heading: str) -> str:
    """Title-case *heading* without upper-casing the letter after an apostrophe."""
    return _HEADING_WORD_RE.sub(lambda m: m.group(0).capitalize(), heading)


def extract_pdf_text(pdf_bytes: bytes) -> str:
    """Extract plain text from PDF bytes using pdfminer.

    Args:
        pdf_bytes: Raw contents of the PDF file.

    Returns:
        The text pdfminer extracted from the document, as one string.
    """
    output = io.StringIO()
    laparams = LAParams(line_margin=0.5, word_margin=0.1)
    extract_text_to_fp(
        io.BytesIO(pdf_bytes),
        output,
        laparams=laparams,
        output_type="text",
        codec="utf-8",
    )
    return output.getvalue()


def split_pdf_into_sections(text: str) -> list:
    """Split raw PDF text into sections (Senate, House, Extensions, DailyDigest).

    Every match of a ``SECTION_HEADERS`` pattern starts a new section that runs
    up to the start of the next header match (or the end of the text). Text
    before the first header is not part of any returned section.

    Args:
        text: Full text of one daily issue.

    Returns:
        A list of dicts ``{"section": CRECSection, "text": str}`` in document
        order. If no header is found, the whole text is returned as a single
        Senate section.
    """
    headers = sorted(
        (
            (match.start(), match.end(), section)
            for pattern, section in SECTION_HEADERS.items()
            for match in pattern.finditer(text)
        ),
        key=lambda header: header[0],
    )

    if not headers:
        return [{"section": CRECSection.Senate, "text": text}]

    sections = []
    for index, (_, body_start, section) in enumerate(headers):
        is_last = index + 1 == len(headers)
        body_end = len(text) if is_last else headers[index + 1][0]
        sections.append({"section": section, "text": text[body_start:body_end].strip()})

    return sections


def split_section_into_granules(section_text: str, section: "CRECSection", heading_pattern=None) -> list:
    """Split a section's text into granules (topics) based on all-caps headings.

    A line is a heading when it matches *heading_pattern* and is longer than
    five characters. Text before the first heading is filed under the title
    "General". A heading followed by no non-blank text produces no granule.

    Args:
        section_text: Text of one section, as produced by
            ``split_pdf_into_sections``.
        section: Section the text belongs to. Currently unused; kept so the
            signature stays stable for callers.
        heading_pattern: Compiled regex used to recognise heading lines.
            Defaults to the module-level ``HEADING_PATTERN``.

    Returns:
        A list of dicts ``{"title": str, "text": str}`` in document order. If
        no granule has any text, a single "General" granule holding
        *section_text* unchanged is returned.
    """
    pattern = HEADING_PATTERN if heading_pattern is None else heading_pattern

    granules = []
    current_title = "General"
    current_lines = []

    def flush():
        # Record the granule collected so far, unless it has no real text.
        body = "\n".join(current_lines).strip()
        if body:
            granules.append({"title": current_title, "text": body})

    for line in section_text.split("\n"):
        stripped = line.strip()
        if stripped and len(stripped) > 5 and pattern.match(stripped):
            flush()
            # str.title() would render "WOMEN'S" as "Women'S".
            current_title = _titlecase_heading(stripped)
            current_lines = []
        else:
            # Blank lines are kept so paragraph breaks survive in the text.
            current_lines.append(line)

    flush()

    return granules if granules else [{"title": "General", "text": section_text}]


def parse_granule_speeches(granule_text: str) -> list:
    """Parse a granule's text into speech segments by speaker.

    Each non-blank line is transliterated with ``unidecode`` and passed to
    ``extract_speaker_name``. When that yields a speaker different from the
    current one, the text gathered so far is emitted as a segment and a new
    segment starts with that line. Text before the first recognised speaker
    is emitted with an empty ``speaker_raw``.

    Args:
        granule_text: Text of one granule.

    Returns:
        A list of dicts ``{"speaker_raw", "content_text", "order_number"}``
        with ``order_number`` counting up from 0 in document order.
    """
    speeches = []
    current_speaker = ""
    current_lines = []

    def flush():
        # order_number is the segment's index, i.e. how many were emitted so far.
        content = "\n".join(current_lines).strip()
        if content:
            speeches.append({
                "speaker_raw": current_speaker,
                "content_text": content,
                "order_number": len(speeches),
            })

    for line in granule_text.split("\n"):
        text = unidecode(line.strip())
        if not text:
            # Keep paragraph breaks, but never start a segment with a blank line.
            if current_lines:
                current_lines.append("")
            continue

        speaker = extract_speaker_name(text)
        if speaker and speaker != current_speaker:
            flush()
            current_speaker = speaker
            current_lines = [text]
        else:
            current_lines.append(text)

    flush()

    return speeches
congress_id: int): - """Import a single day's Congressional Record.""" + """Import a single day's Congressional Record from the PDF.""" package_id = f"CREC-{issue_date.isoformat()}" existing = session.query(CRECIssue).filter( @@ -283,7 +293,7 @@ def import_daily_record(issue_date: date, session, congress_id: int): logger.info(f"Skipping {package_id} - already imported") return - url = BULK_DATA_URL.format( + url = PDF_URL.format( year=issue_date.year, month=issue_date.month, day=issue_date.day, @@ -291,7 +301,7 @@ def import_daily_record(issue_date: date, session, congress_id: int): logger.info(f"Downloading {url}") try: - resp = requests.get(url, timeout=60) + resp = requests.get(url, timeout=120) except requests.RequestException as e: logger.warning(f"Failed to download {url}: {e}") return @@ -303,6 +313,13 @@ def import_daily_record(issue_date: date, session, congress_id: int): logger.warning(f"Unexpected status {resp.status_code} for {url}") return + logger.info(f"Extracting text from {package_id} PDF ({len(resp.content)} bytes)") + try: + full_text = extract_pdf_text(resp.content) + except Exception as e: + logger.warning(f"Failed to extract PDF text for {package_id}: {e}") + return + issue = CRECIssue( issue_date=issue_date, congress_id=congress_id, @@ -311,106 +328,73 @@ def import_daily_record(issue_date: date, session, congress_id: int): session.add(issue) session.flush() - try: - zf = zipfile.ZipFile(io.BytesIO(resp.content)) - except zipfile.BadZipFile: - logger.warning(f"Bad ZIP file for {package_id}") - session.rollback() - return - - mods_metadata = {} - for name in zf.namelist(): - if name.endswith("mods.xml"): - try: - mods_content = zf.read(name) - mods_metadata = parse_mods_xml(mods_content) - except Exception as e: - logger.warning(f"Failed to parse MODS {name}: {e}") - break - - htm_files = sorted([ - n for n in zf.namelist() - if n.endswith(".htm") or n.endswith(".html") - ]) - speaker_cache = {} granule_order = 0 - for htm_file in htm_files: - 
basename = os.path.splitext(os.path.basename(htm_file))[0] - section = map_section(htm_file) - - meta = mods_metadata.get(basename, {}) - title = meta.get("title", basename) - page_start = meta.get("page_start") - page_end = meta.get("page_end") - - granule = CRECGranule( - crec_issue_id=issue.crec_issue_id, - granule_id=f"{package_id}/{basename}", - section=section, - title=title, - page_start=page_start, - page_end=page_end, - order_number=granule_order, - ) - session.add(granule) - session.flush() - granule_order += 1 - - try: - content = zf.read(htm_file).decode("utf-8", errors="replace") - except Exception as e: - logger.warning(f"Failed to read {htm_file}: {e}") - continue - - speech_segments = parse_htm_content(content) - - for seg in speech_segments: - speaker_raw = seg["speaker_raw"] - - if speaker_raw in speaker_cache: - bioguide_id = speaker_cache[speaker_raw] - else: - bioguide_id = resolve_speaker(speaker_raw, section, session) - speaker_cache[speaker_raw] = bioguide_id - - content_text = seg["content_text"] - word_count = len(content_text.split()) - - speech = CRECSpeech( - crec_granule_id=granule.crec_granule_id, - speaker_raw=speaker_raw or None, - legislator_bioguide_id=bioguide_id, - order_number=seg["order_number"], - content_text=content_text, - word_count=word_count, + sections = split_pdf_into_sections(full_text) + for sec in sections: + section = sec["section"] + granules = split_section_into_granules(sec["text"], section) + + for gran in granules: + granule = CRECGranule( + crec_issue_id=issue.crec_issue_id, + granule_id=f"{package_id}/{section.value}/{granule_order}", + section=section, + title=gran["title"], + order_number=granule_order, ) - session.add(speech) + session.add(granule) session.flush() - - bill_refs = extract_bill_references(content_text) - seen_refs = set() - for ref in bill_refs: - ref_key = (ref["chamber"], ref["number"], ref["legislation_type"]) - if ref_key in seen_refs: - continue - seen_refs.add(ref_key) - - 
legislation_id = resolve_bill_reference(ref, congress_id, session) - - bill_reference = CRECBillReference( - crec_speech_id=speech.crec_speech_id, - legislation_id=legislation_id, - cite_text=ref["cite_text"], - cite_type=ref["cite_type"], - start_offset=ref["start"], - end_offset=ref["end"], + granule_order += 1 + + speech_segments = parse_granule_speeches(gran["text"]) + for seg in speech_segments: + speaker_raw = seg["speaker_raw"] + + if speaker_raw in speaker_cache: + bioguide_id = speaker_cache[speaker_raw] + else: + bioguide_id = resolve_speaker(speaker_raw, section, session) + speaker_cache[speaker_raw] = bioguide_id + + content_text = seg["content_text"] + word_count = len(content_text.split()) + + speech = CRECSpeech( + crec_granule_id=granule.crec_granule_id, + speaker_raw=speaker_raw or None, + legislator_bioguide_id=bioguide_id, + order_number=seg["order_number"], + content_text=content_text, + word_count=word_count, ) - session.add(bill_reference) + session.add(speech) + session.flush() + + bill_refs = extract_bill_references(content_text) + seen_refs = set() + for ref in bill_refs: + ref_key = (ref["chamber"], ref["number"], ref["legislation_type"]) + if ref_key in seen_refs: + continue + seen_refs.add(ref_key) + + legislation_id = resolve_bill_reference(ref, congress_id, session) + + bill_reference = CRECBillReference( + crec_speech_id=speech.crec_speech_id, + legislation_id=legislation_id, + cite_text=ref["cite_text"], + cite_type=ref["cite_type"], + start_offset=ref["start"], + end_offset=ref["end"], + ) + session.add(bill_reference) session.commit() logger.info(f"Imported {package_id}: {granule_order} granules") + return True def run_import(start_date: date = None, end_date: date = None): @@ -447,6 +431,7 @@ def run_import(start_date: date = None, end_date: date = None): if current.weekday() < 5: try: import_daily_record(current, db, congress_id) + except Exception as e: logger.error(f"Error importing {current}: {e}", exc_info=True) db.rollback() 
diff --git a/backend/requirements.txt b/backend/requirements.txt index 1f456e5a..f2c28db8 100644 --- a/backend/requirements.txt +++ b/backend/requirements.txt @@ -36,6 +36,7 @@ chromadb~=0.6.3 # Parsing tools lxml==6.0.2 +pdfminer.six==20231228 genson==1.3.0 jsonschema==4.25.1 pandas==2.3.3