|
| 1 | +#!/usr/bin/env python3 |
| 2 | +import argparse |
| 3 | +import os |
| 4 | +import tempfile |
| 5 | +import zipfile |
| 6 | +from urllib.parse import quote_plus |
| 7 | + |
| 8 | +from dotenv import load_dotenv |
| 9 | +from google.cloud import storage |
| 10 | +from google.oauth2 import service_account |
| 11 | +from bson import ObjectId |
| 12 | +from pymongo import MongoClient |
| 13 | + |
| 14 | + |
def build_mongo_uri(address: str, username: str, password: str) -> str:
    """Build a MongoDB connection URI from an address and credentials.

    A fully-qualified URI is returned unchanged; an address that already
    embeds ``user:pass@`` only gets the scheme prepended; otherwise the
    username and password are percent-encoded and inserted.
    """
    host = address.strip()
    if host.startswith(("mongodb://", "mongodb+srv://")):
        return host
    if "@" in host:
        return f"mongodb://{host}"
    return f"mongodb://{quote_plus(username)}:{quote_plus(password)}@{host}"
| 24 | + |
| 25 | + |
def guess_extension(content_type: str) -> str:
    """Map a known MIME type to a file extension; return "" if unknown/empty."""
    if not content_type:
        return ""
    known = {
        "application/pdf": ".pdf",
        "application/msword": ".doc",
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document": ".docx",
        "image/png": ".png",
        "image/jpeg": ".jpg",
    }
    return known.get(content_type, "")
| 37 | + |
| 38 | + |
def safe_name(value: str) -> str:
    """Reduce *value* to filesystem-safe characters.

    Keeps only alphanumerics, hyphens, and underscores from the stripped
    input; falls back to "unknown" when nothing survives.
    """
    allowed_punct = {"-", "_"}
    kept = [ch for ch in value.strip() if ch.isalnum() or ch in allowed_punct]
    result = "".join(kept)
    return result if result else "unknown"
| 44 | + |
| 45 | + |
def main() -> int:
    """Download every hacker resume from GCS and bundle them into a zip.

    Configuration comes from the environment (loaded via .env): Mongo
    credentials (DB_ADDRESS_DEPLOY / DB_USER_DEPLOY / DB_PASS_DEPLOY),
    the bucket name (BUCKET_NAME), and the GCS service-account fields
    (TYPE, PROJECT_ID, PRIVATE_KEY_ID, PRIVATE_KEY, CLIENT_EMAIL,
    CLIENT_ID, AUTH_URI, TOKEN_URI, AUTH_PROVIDER_X509_CERT_URL,
    CLIENT_X509_CERT_URL).

    Returns:
        0 on success (used as the process exit code).

    Raises:
        SystemExit: when any required environment variable is missing.
    """
    load_dotenv(override=True)
    parser = argparse.ArgumentParser(
        description="Download all hacker resumes from GCS and bundle into a zip."
    )
    parser.add_argument(
        "--out",
        default="resumes.zip",
        help="Output zip path (default: resumes.zip).",
    )
    parser.add_argument(
        "--debug",
        action="store_true",
        help="Enable verbose logging for troubleshooting.",
    )
    args = parser.parse_args()

    address = os.environ.get("DB_ADDRESS_DEPLOY")
    username = os.environ.get("DB_USER_DEPLOY")
    password = os.environ.get("DB_PASS_DEPLOY")
    bucket_name = os.environ.get("BUCKET_NAME")

    missing = [k for k, v in {
        "DB_ADDRESS_DEPLOY": address,
        "DB_USER_DEPLOY": username,
        "DB_PASS_DEPLOY": password,
        "BUCKET_NAME": bucket_name,
    }.items() if not v]
    if missing:
        raise SystemExit(f"Missing required env vars: {', '.join(missing)}")

    gcs_env = {
        "TYPE": os.environ.get("TYPE"),
        "PROJECT_ID": os.environ.get("PROJECT_ID"),
        "PRIVATE_KEY_ID": os.environ.get("PRIVATE_KEY_ID"),
        "PRIVATE_KEY": os.environ.get("PRIVATE_KEY"),
        "CLIENT_EMAIL": os.environ.get("CLIENT_EMAIL"),
        "CLIENT_ID": os.environ.get("CLIENT_ID"),
        "AUTH_URI": os.environ.get("AUTH_URI"),
        "TOKEN_URI": os.environ.get("TOKEN_URI"),
        "AUTH_PROVIDER_X509_CERT_URL": os.environ.get("AUTH_PROVIDER_X509_CERT_URL"),
        "CLIENT_X509_CERT_URL": os.environ.get("CLIENT_X509_CERT_URL"),
    }
    missing_gcs = [k for k, v in gcs_env.items() if not v]
    if missing_gcs:
        raise SystemExit(f"Missing required GCS env vars: {', '.join(missing_gcs)}")

    mongo_uri = build_mongo_uri(address, username, password)
    if args.debug:
        # FIX: never log credentials — redact the (URL-encoded) password
        # before printing the connection string.
        print(f"Mongo URI: {mongo_uri.replace(quote_plus(password), '***')}")
    client = MongoClient(mongo_uri)
    db_name = "hackboard-deploy"
    db = client[db_name]
    hackers = db["hackers"]
    accounts = db["accounts"]
    account_cache = {}
    if args.debug:
        print(f"Database: {db_name}")
        print(f"Collections: {', '.join(sorted(db.list_collection_names()))}")
        print(f"Hackers count: {hackers.count_documents({})}")

    query = {"application.general.URL.resume": {"$exists": True, "$ne": ""}}
    projection = {"application.general.URL.resume": 1, "accountId": 1}
    cursor = hackers.find(query, projection=projection)
    if args.debug:
        match_count = hackers.count_documents(query)
        print(f"Resume query matches: {match_count}")

    # .env files commonly store the PEM key with literal "\n" sequences;
    # restore real newlines so the key parses.
    private_key = gcs_env["PRIVATE_KEY"]
    if private_key and "\\n" in private_key:
        private_key = private_key.replace("\\n", "\n")

    credentials_info = {
        "type": gcs_env["TYPE"],
        "project_id": gcs_env["PROJECT_ID"],
        "private_key_id": gcs_env["PRIVATE_KEY_ID"],
        "private_key": private_key,
        "client_email": gcs_env["CLIENT_EMAIL"],
        "client_id": gcs_env["CLIENT_ID"],
        "auth_uri": gcs_env["AUTH_URI"],
        "token_uri": gcs_env["TOKEN_URI"],
        "auth_provider_x509_cert_url": gcs_env["AUTH_PROVIDER_X509_CERT_URL"],
        "client_x509_cert_url": gcs_env["CLIENT_X509_CERT_URL"],
    }
    credentials = service_account.Credentials.from_service_account_info(
        credentials_info
    )
    storage_client = storage.Client(
        project=credentials_info["project_id"], credentials=credentials
    )
    bucket = storage_client.bucket(bucket_name)

    total = 0
    downloaded = 0
    # FIX: track assigned filenames so hackers who share a first/last name
    # no longer silently overwrite each other's resume in the tempdir.
    used_names = set()
    with tempfile.TemporaryDirectory() as tmpdir:
        for doc in cursor:
            total += 1
            resume_path = (
                doc.get("application", {})
                .get("general", {})
                .get("URL", {})
                .get("resume", "")
            )
            if not resume_path:
                if args.debug:
                    print(f"Skip {doc.get('_id')}: missing resume path")
                continue

            blob = bucket.blob(resume_path)
            if not blob.exists():
                if args.debug:
                    print(f"Missing blob: {resume_path}")
                continue

            # accountId may be a raw ObjectId, an extended-JSON {"$oid": ...}
            # dict, or a plain string — normalize toward ObjectId.
            account_id = doc.get("accountId")
            if isinstance(account_id, dict) and "$oid" in account_id:
                account_id = account_id["$oid"]
            if isinstance(account_id, str):
                try:
                    account_id = ObjectId(account_id)
                except Exception:
                    pass  # keep the string; some accounts may use string _ids
            account = {}
            if account_id in account_cache:
                account = account_cache[account_id]
            elif account_id is not None:
                account = accounts.find_one({"_id": account_id}) or {}
                if not account and isinstance(account_id, ObjectId):
                    # Fall back to string _ids for legacy account documents.
                    account = (
                        accounts.find_one({"_id": str(account_id)}) or {}
                    )
                account_cache[account_id] = account

            first = safe_name(str(account.get("firstName", "")))
            last = safe_name(str(account.get("lastName", "")))

            if args.debug:
                print("Names: ", first, last)

            if first == "unknown" and last == "unknown":
                name_stub = str(doc["_id"])
            else:
                name_stub = f"{first}_{last}"

            basename = os.path.basename(resume_path)
            ext = os.path.splitext(basename)[1]
            if not ext:
                # Path has no extension; reload() fetches blob metadata so we
                # can guess one from the Content-Type.
                blob.reload()
                ext = guess_extension(blob.content_type)

            if name_stub != str(doc["_id"]):
                local_name = f"{name_stub}_resume{ext}"
            else:
                # FIX: previously ext was appended unconditionally, doubling
                # the extension (e.g. "id__resume.pdf.pdf"). Only append when
                # the basename lacks it.
                local_name = f"{doc['_id']}__{basename}"
                if ext and not local_name.endswith(ext):
                    local_name += ext

            # FIX: de-duplicate colliding names with a numeric suffix.
            root, suffix = os.path.splitext(local_name)
            serial = 1
            while local_name in used_names:
                local_name = f"{root}_{serial}{suffix}"
                serial += 1
            used_names.add(local_name)

            local_path = os.path.join(tmpdir, local_name)

            if args.debug:
                print(f"Downloading {resume_path} -> {local_name}")
            with open(local_path, "wb") as fh:
                fh.write(blob.download_as_bytes())
            downloaded += 1

        with zipfile.ZipFile(args.out, "w", compression=zipfile.ZIP_DEFLATED) as zf:
            # Sort for a deterministic archive order.
            for name in sorted(os.listdir(tmpdir)):
                path = os.path.join(tmpdir, name)
                zf.write(path, arcname=name)

    client.close()  # release the Mongo connection pool

    print(f"Processed {total} hackers, downloaded {downloaded} resumes.")
    print(f"Wrote {args.out}.")
    return 0
| 217 | + |
| 218 | + |
# Script entry point: propagate main()'s return value as the process exit code.
if __name__ == "__main__":
    raise SystemExit(main())
0 commit comments