Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -3,5 +3,6 @@
*.xml
*.pyc
.env/
.venv/
private/
./idea
23 changes: 16 additions & 7 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# StackOverflow data to postgres

This is a quick script to move the Stackoverflow data from the [StackExchange
data dump (Sept '14)](https://archive.org/details/stackexchange) to a Postgres
data dump (Sept '25)](https://archive.org/details/stackexchange_20250930) to a Postgres
SQL database.

Schema hints are taken from [a post on
Expand All @@ -10,21 +10,30 @@ and from [StackExchange Data Explorer](http://data.stackexchange.com).

## Quickstart

Install requirements, create a new database (e.g. `beerSO` below), and use `load_into_pg.py` script:
### Using uv (Recommended)

Install dependencies with [uv](https://github.com/astral-sh/uv), create a new database (e.g. `beerSO` below) or let the script create it for you, and run the script:

``` console
$ uv sync
$ createdb beerSO
$ uv run python load_into_pg.py -s beer -d beerSO
```

### Using pip

Alternatively, you can use pip:

``` console
$ pip install -r requirements.txt
...
Successfully installed argparse-1.2.1 libarchive-c-4.0 lxml-4.9.1 psycopg2-binary-2.8.4 six-1.10.0
$ createdb beerSO
$ python load_into_pg.py -s beer -d beerSO
```

This will download compressed files from
[archive.org](https://ia800107.us.archive.org/27/items/stackexchange/) and load
[archive.org](https://archive.org/download/stackexchange_20250930/stackexchange_20250930/) and load
all the tables at once.


## Advanced Usage

You can use a custom database name as well. Make sure to explicitly give it
Expand All @@ -46,7 +55,7 @@ You can manually download the files to the folder from which the program is
executed: `Badges.xml`, `Votes.xml`, `Posts.xml`, `Users.xml`, `Tags.xml`. In
some old dumps, the casing of the filenames differs.

Then load each file with e.g. `python load_into_pg.py -t Badges`.
Then load each file with e.g. `uv run python load_into_pg.py -t Badges`.

After all the initial tables have been created:

Expand Down
206 changes: 187 additions & 19 deletions load_into_pg.py
Original file line number Diff line number Diff line change
@@ -1,19 +1,26 @@
#!/usr/bin/env python

import argparse
import getpass
import json
import os
import sys
import time
import argparse
from typing import Any

import psycopg2 as pg
import os
import row_processor as Processor
from psycopg2 import sql
import six
import json

import row_processor as Processor

# Special rules needed for certain tables (esp. for old database dumps)
specialRules = {("Posts", "ViewCount"): "NULLIF(%(ViewCount)s, '')::int"}

# part of the file already downloaded
file_part = None
_resolved_password = None
_password_loaded = False


def show_progress(block_num, block_size, total_size):
Expand Down Expand Up @@ -46,35 +53,162 @@ def show_progress(block_num, block_size, total_size):
file_part = None
six.print_("")


def getConnectionParameters():
"""Get the parameters for the connection to the database."""

parameters = {}

if args.dbname:
parameters['dbname'] = args.dbname
parameters["dbname"] = args.dbname

if args.host:
parameters['host'] = args.host
parameters["host"] = args.host

if args.port:
parameters['port'] = args.port
parameters["port"] = args.port

if args.username:
parameters['user'] = args.username
parameters["user"] = args.username

if args.password:
parameters['password'] = args.password
password = _resolvePassword()
if password:
parameters["password"] = password

if args.schema_name:
parameters['options'] = "-c search_path=" + args.schema_name
sslmode = args.sslmode
if not sslmode and args.host and args.host.endswith(".neon.tech"):
# Neon pooler requires TLS. Enable it automatically for Neon hosts.
sslmode = "require"

if sslmode:
parameters["sslmode"] = sslmode

return parameters


def _resolvePassword():
"""Resolve password from CLI/env/file/prompt exactly once.

If no password source is configured, libpq defaults are used, which include
~/.pgpass and environment variables supported by libpq.
"""
global _resolved_password, _password_loaded

if _password_loaded:
return _resolved_password

_password_loaded = True

if args.password:
_resolved_password = args.password
return _resolved_password

if args.password_env:
_resolved_password = os.getenv(args.password_env)
if _resolved_password is None:
raise RuntimeError(
"Password environment variable '{}' is not set.".format(
args.password_env
)
)
return _resolved_password

if args.password_file:
try:
with open(args.password_file) as f:
_resolved_password = f.readline().rstrip("\r\n")
except OSError as e:
raise RuntimeError(
"Could not read password file '{}': {}".format(
args.password_file, str(e)
)
)
return _resolved_password

if args.prompt_password:
_resolved_password = getpass.getpass("PostgreSQL password: ")
return _resolved_password

return None


def ensureDatabaseExists():
"""Create the database if it does not exist.

First tries to connect to the target database directly. If it exists, we
continue. If it does not exist, connect to the 'postgres' database and
create it.
"""
params = getConnectionParameters()
target_dbname = params.get("dbname", "stackoverflow")

# Fast path: database already exists and is accessible.
try:
with pg.connect(**params):
six.print_("Database '{}' already exists.".format(target_dbname))
return
except pg.Error as e:
# 3D000 = invalid_catalog_name (database does not exist)
if e.pgcode != "3D000":
six.print_(
"Error checking/creating database: {}".format(str(e)), file=sys.stderr
)
raise

admin_params = dict(params)
admin_params["dbname"] = "postgres"

conn = None
try:
# Connect to postgres database to create target database
conn = pg.connect(**admin_params)
# Set autocommit for CREATE DATABASE (cannot run in transaction)
conn.autocommit = True

with conn.cursor() as cur:
six.print_(
"Database '{}' does not exist. Creating...".format(target_dbname)
)
# Create database with proper quoting to avoid SQL injection
cur.execute(
sql.SQL("CREATE DATABASE {}").format(sql.Identifier(target_dbname))
)
six.print_("Database '{}' created successfully.".format(target_dbname))
except pg.Error as e:
six.print_(
"Error checking/creating database: {}".format(str(e)), file=sys.stderr
)
raise
finally:
if conn is not None:
conn.close()


def _makeDefValues(keys):
"""Returns a dictionary containing None for all keys."""
return dict(((k, None) for k in keys))
return {k: None for k in keys}


def _setSearchPath(cur):
"""Apply schema search_path after connect.

Some managed poolers (e.g. Neon pooler) reject startup options, so we set
search_path with SQL after the connection has been established.
"""
if not args.schema_name:
return

schemas = [
schema.strip() for schema in args.schema_name.split(",") if schema.strip()
]
if not schemas:
return

cur.execute(
sql.SQL("SET search_path TO {}").format(
sql.SQL(", ").join([sql.Identifier(schema) for schema in schemas])
)
)


def _createMogrificationTemplate(table, keys, insertJson):
Expand All @@ -100,7 +234,7 @@ def _createCmdTuple(cursor, keys, templ, attribs, insertJson):
`cursor` is used to mogrify the data and the `templ` is the template used
for the mogrification.
"""
defs = _makeDefValues(keys)
defs: dict[str, Any] = _makeDefValues(keys)
defs.update(attribs)

if insertJson:
Expand Down Expand Up @@ -202,6 +336,8 @@ def handleTable(table, insertJson, createFk, mbDbFile):
with pg.connect(**getConnectionParameters()) as conn:
with conn.cursor() as cur:
try:
_setSearchPath(cur)

with open(dbFile, "rb") as xml:
# Pre-processing (dropping/creation of tables)
six.print_("Pre-processing ...")
Expand Down Expand Up @@ -279,6 +415,7 @@ def handleTable(table, insertJson, createFk, mbDbFile):
six.print_("Warning from the database.", file=sys.stderr)
six.print_("pg.Warning: {0}".format(str(w)), file=sys.stderr)


#############################################################

parser = argparse.ArgumentParser()
Expand Down Expand Up @@ -317,7 +454,7 @@ def handleTable(table, insertJson, createFk, mbDbFile):
parser.add_argument(
"--archive-url",
help="URL of the archive directory to retrieve.",
default="https://ia800107.us.archive.org/27/items/stackexchange",
default="https://archive.org/download/stackexchange_20250930/stackexchange_20250930",
)

parser.add_argument(
Expand All @@ -330,7 +467,29 @@ def handleTable(table, insertJson, createFk, mbDbFile):

parser.add_argument("-u", "--username", help="Username for the database.", default=None)

parser.add_argument("-p", "--password", help="Password for the database.", default=None)
password_group = parser.add_mutually_exclusive_group()
password_group.add_argument(
"-p",
"--password",
help="Password for the database (less secure; visible in shell history).",
default=None,
)
password_group.add_argument(
"--password-env",
help="Environment variable name containing the database password.",
default=None,
)
password_group.add_argument(
"--password-file",
help="Read the database password from the first line of a file.",
default=None,
)
password_group.add_argument(
"--prompt-password",
help="Prompt for database password without echo.",
action="store_true",
default=False,
)

parser.add_argument(
"-P", "--port", help="Port to connect with the database on.", default=None
Expand All @@ -357,6 +516,13 @@ def handleTable(table, insertJson, createFk, mbDbFile):
"-n", "--schema-name", help="Use specific schema.", default="public"
)

parser.add_argument(
"--sslmode",
help="SSL mode for PostgreSQL connection.",
choices=["disable", "allow", "prefer", "require", "verify-ca", "verify-full"],
default=None,
)

parser.add_argument(
"--foreign-keys", help="Create foreign keys.", action="store_true", default=False
)
Expand All @@ -365,10 +531,13 @@ def handleTable(table, insertJson, createFk, mbDbFile):

try:
# Python 2/3 compatibility
input = raw_input
input = raw_input # type: ignore[name-defined]
except NameError:
pass

# Ensure database exists before proceeding
ensureDatabaseExists()

# load given file in table
if args.file and args.table:
table = args.table
Expand All @@ -381,8 +550,7 @@ def handleTable(table, insertJson, createFk, mbDbFile):
choice = input("This will drop the {} table. Are you sure [y/n]?".format(table))

if len(choice) > 0 and choice[0].lower() == "y":
handleTable(
table, args.insert_json, args.foreign_keys, args.file)
handleTable(table, args.insert_json, args.foreign_keys, args.file)
else:
six.print_("Cancelled.")

Expand Down
21 changes: 21 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
[project]
name = "stackexchange-dump-to-postgres"
version = "0.1.0"
description = "Load StackExchange data dumps into PostgreSQL"
readme = "README.md"
requires-python = ">=3.8"
license = { text = "MIT" }
dependencies = [
"libarchive-c>=2.9",
"lxml>=4.9.0",
"psycopg2-binary>=2.9.0",
"six>=1.16.0",
]

[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"

[tool.hatch.build.targets.wheel]
packages = ["."]
only-include = ["load_into_pg.py", "row_processor.py", "sql/*.sql"]
4 changes: 2 additions & 2 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
argparse==1.2.1
libarchive-c==2.9
lxml==4.6.3
libarchive-c==4.0
lxml==4.9.1
psycopg2-binary==2.8.4
six==1.10.0
Loading