From 02bcb22c5ce339bc031ec4308e730dacbd79f329 Mon Sep 17 00:00:00 2001
From: Jan Jambor
Date: Thu, 22 May 2025 12:00:54 +0200
Subject: [PATCH] new: Script to transform an Azure DevOps export CSV into an
 importable work item CSV.

---
 README.md               |  57 ++++++++++++++
 reorder_requirements.py | 162 ++++++++++++++++++++++++++++++++++++++++
 requirements.pip        |   1 +
 3 files changed, 220 insertions(+)
 create mode 100644 README.md
 create mode 100644 reorder_requirements.py
 create mode 100644 requirements.pip

diff --git a/README.md b/README.md
new file mode 100644
index 0000000..e12ff57
--- /dev/null
+++ b/README.md
@@ -0,0 +1,57 @@
+# Python Azure DevOps Work Item CSV
+
+## Run Example
+
+```bash
+# 1. Create and activate an isolated Python environment
+python3 -m venv .venv
+source .venv/bin/activate
+
+# 2. Install runtime dependencies
+python -m pip install --upgrade pip
+python -m pip install -r requirements.pip
+
+# 3. Convert the CSV
+python reorder_requirements.py "input.csv" "output.csv"
+
+# Or with verbose logging and absolute paths
+python reorder_requirements.py -v "/Users/jj/Downloads/input.csv" "/Users/jj/Downloads/output.csv"
+```
+
+## Execution Logic Summary
+
+1. `build_level_map` walks up the `Parent` chain to assign every row its depth (1, 2, 3, …).
+2. `depth_first_order` produces a depth-first sequence of IDs so that each parent is immediately followed by its children (and grandchildren).
+3. `restructure` creates the empty `ID` column plus the mutually exclusive `Title 1`–`Title 3` columns and appends every *other* original column (except the old `ID`, `Parent`, and `Title`).
+4. The script writes the new CSV; by default nothing is printed, and with `-v` every step plus a final “Success - wrote … rows” confirmation is logged.
+
+You’ll get a file whose first four columns are:
+
+| ID | Title 1 | Title 2 | Title 3 | …other original fields… |
+| -- | ------- | ------- | ------- | ----------------------- |
+
+where:
+
+* Top-level items fill **Title 1**.
+* Second-level items fill **Title 2**.
+* Third-level (and deeper) items fill **Title 3**.
+
+All hierarchy constraints described above are enforced automatically; the worked example below shows the end result. Feel free to adapt the column names or add CLI switches if you ever need variations (e.g., a different max depth).
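+
+## Worked Example
+
+Suppose the exported CSV contains this hypothetical three-item hierarchy (`State` stands in for any extra exported column; `Parent` is blank for top-level items):
+
+| ID | Title     | Parent | State  |
+| -- | --------- | ------ | ------ |
+| 1  | Epic A    |        | New    |
+| 3  | Story C   | 2      | Active |
+| 2  | Feature B | 1      | New    |
+
+Running the script reorders the rows depth-first and spreads `Title` across the level columns:
+
+| ID | Title 1 | Title 2   | Title 3 | State  |
+| -- | ------- | --------- | ------- | ------ |
+|    | Epic A  |           |         | New    |
+|    |         | Feature B |         | New    |
+|    |         |           | Story C | Active |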
diff --git a/reorder_requirements.py b/reorder_requirements.py
new file mode 100644
index 0000000..035708f
--- /dev/null
+++ b/reorder_requirements.py
@@ -0,0 +1,162 @@
+#!/usr/bin/env python3
+"""
+Re-orders a hierarchical CSV and rewrites it for Azure DevOps Boards import.
+
+Adds:
+  • Rich INFO-level logging of every major step
+  • A sanity check ensuring the output is
+      1) complete   - same number of rows, no extras or losses
+      2) correct    - every non-hierarchy cell identical to the input
+      3) functional - has the required columns for ADO import, in the right order
+"""
+
+from __future__ import annotations
+
+import argparse
+import hashlib
+import logging
+from collections import defaultdict
+
+import pandas as pd
+
+
+def _hash_row(row: pd.Series) -> str:
+    """Stable hash of a row - used to compare multi-column equality regardless of row order."""
+    txt = "||".join(str(v) for v in row.tolist())
+    return hashlib.md5(txt.encode("utf-8")).hexdigest()
+
+
+def build_level_map(df: pd.DataFrame) -> dict[int, int]:
+    """Map every ID to its depth (1 = top level) by walking up the Parent chain."""
+    log = logging.getLogger("hierarchy")
+    level_cache: dict[int, int] = {}
+
+    def level_of(_id: int) -> int:
+        if _id in level_cache:
+            return level_cache[_id]
+        parent = df.loc[df["ID"] == _id, "Parent"].iloc[0]
+        if pd.isna(parent):
+            level_cache[_id] = 1
+        else:
+            level_cache[_id] = 1 + level_of(int(parent))
+        return level_cache[_id]
+
+    for _id in df["ID"]:
+        level_of(int(_id))
+    log.info("Calculated depth for %d items", len(level_cache))
+    return level_cache
+
+
+def depth_first_order(df: pd.DataFrame) -> list[int]:
+    """Return all IDs depth-first, so every parent is immediately followed by its subtree."""
+    children: defaultdict[int, list[int]] = defaultdict(list)
+    for _, row in df.iterrows():
+        if not pd.isna(row["Parent"]):
+            children[int(row["Parent"])].append(int(row["ID"]))
+
+    ordered: list[int] = []
+
+    def visit(node_id: int) -> None:
+        ordered.append(node_id)
+        for child_id in children.get(node_id, []):
+            visit(child_id)
+
+    for root_id in df[pd.isna(df["Parent"])]["ID"]:
+        visit(int(root_id))
+    logging.getLogger("hierarchy").info("Produced depth-first order of %d IDs", len(ordered))
+    return ordered
+
+
+def restructure(df: pd.DataFrame) -> tuple[pd.DataFrame, list[str]]:
+    """Reorder the rows depth-first and rewrite the hierarchy columns for ADO import."""
+    level_map = build_level_map(df)
+    ordered_ids = depth_first_order(df)
+
+    df_sorted = df.set_index("ID").loc[ordered_ids].reset_index()
+    df_sorted["_level"] = df_sorted["ID"].map(level_map)
+
+    # New ADO-style columns
+    df_sorted.insert(0, "ID_new", "")
+    df_sorted["Title 1"] = df_sorted.apply(lambda r: r["Title"] if r["_level"] == 1 else "", axis=1)
+    df_sorted["Title 2"] = df_sorted.apply(lambda r: r["Title"] if r["_level"] == 2 else "", axis=1)
+    df_sorted["Title 3"] = df_sorted.apply(lambda r: r["Title"] if r["_level"] >= 3 else "", axis=1)
+
+    cols_to_drop = {"ID", "Parent", "Title", "_level"}
+    other_cols = [c for c in df_sorted.columns if c not in cols_to_drop | {"ID_new", "Title 1", "Title 2", "Title 3"}]
+
+    final_df = df_sorted[["ID_new", "Title 1", "Title 2", "Title 3", *other_cols]]
+    final_df = final_df.rename(columns={"ID_new": "ID"})
+    logging.getLogger("transform").info("Restructured to %d columns (%s)", len(final_df.columns), ", ".join(final_df.columns))
+    return final_df, other_cols
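+
+
+# A minimal illustration of the two hierarchy helpers on hypothetical data
+# (doctest-style sketch only - nothing here is executed by the CLI):
+#
+#   >>> demo = pd.DataFrame({"ID": [1, 2, 3],
+#   ...                      "Parent": [None, 1, 2],
+#   ...                      "Title": ["Epic", "Feature", "Story"]})
+#   >>> build_level_map(demo)
+#   {1: 1, 2: 2, 3: 3}
+#   >>> depth_first_order(demo)
+#   [1, 2, 3]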
+
+
+def sanity_check(df_in: pd.DataFrame, df_out: pd.DataFrame, other_cols: list[str]) -> None:
+    """Verify the output is complete, correct, and functional for ADO import."""
+    log = logging.getLogger("check")
+
+    # 1) complete ──────────────────────────────────────────────────────────
+    if len(df_in) != len(df_out):
+        raise ValueError(f"Row count mismatch – input:{len(df_in)} vs output:{len(df_out)}")
+    log.info("Completeness ✔ %d rows", len(df_in))
+
+    # If there are no extra data-bearing columns, skip the correctness test
+    if not other_cols:
+        log.info("Correctness ✔ (no non-hierarchy columns present)")
+    else:
+        # 2) correct ──────────────────────────────────────────────────────
+        in_hashes = df_in[other_cols].apply(_hash_row, axis=1).value_counts().sort_index()
+        out_hashes = df_out[other_cols].apply(_hash_row, axis=1).value_counts().sort_index()
+
+        if not in_hashes.equals(out_hashes):
+            # fill_value=0 keeps hashes that occur on only one side in the diff
+            diff = in_hashes.sub(out_hashes, fill_value=0)
+            diff = diff[diff != 0]
+            raise ValueError(f"Data mismatch detected in {len(diff)} row(s) – hashes differ")
+        log.info("Correctness ✔ all %d non-hierarchy cells identical", len(df_in) * len(other_cols))
+
+    # 3) functional ────────────────────────────────────────────────────────
+    required_cols = ["ID", "Title 1", "Title 2", "Title 3"]
+    if df_out.columns.tolist()[:4] != required_cols:
+        raise ValueError("Required ADO columns missing or out of order")
+    if not df_out["ID"].eq("").all():
+        raise ValueError("The first column 'ID' must be empty for ADO import")
+
+    log.info("Functional ✔ output format matches Azure DevOps Boards import requirements")
+
+
+def main() -> None:
+    argp = argparse.ArgumentParser(description="Re-order a hierarchical CSV for ADO Boards")
+    argp.add_argument("input_csv")
+    argp.add_argument("output_csv")
+    argp.add_argument("-v", "--verbose", action="store_true", help="log INFO messages")
+    args = argp.parse_args()
+
+    logging.basicConfig(
+        level=logging.INFO if args.verbose else logging.WARNING,
+        format="%(levelname)s %(name)s: %(message)s",
+    )
+
+    logging.info("Reading %s …", args.input_csv)
+    df_in = pd.read_csv(args.input_csv)
+
+    df_out, other_cols = restructure(df_in)
+    sanity_check(df_in, df_out, other_cols)
+
+    df_out.to_csv(args.output_csv, index=False)
+    logging.info("Success - wrote %d rows to %s", len(df_out), args.output_csv)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/requirements.pip b/requirements.pip
new file mode 100644
index 0000000..fba4c52
--- /dev/null
+++ b/requirements.pip
@@ -0,0 +1 @@
+pandas>=2.2,<3.0