# ==========================================================
# 1. ENVIRONMENT SETUP
# ==========================================================
# This cell ensures all dependencies are installed correctly even on a fresh system.

%pip install --quiet --upgrade pip
%pip install --quiet pandas numpy matplotlib scikit-learn seaborn jupyterlab notebook

import os, re, math, numpy as np, pandas as pd, matplotlib.pyplot as plt, seaborn as sns
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsRegressor

pd.set_option("display.max_columns", 120)
sns.set_style("whitegrid")

# Project directory (edit if different)
DATA_DIR = "./train"
OUT_MERGED = "./merged_train.csv"
TARGET = "amount_new_house_transaction"
TIME_COL = "month"
SECTOR_COL = "sector"
RANDOM_STATE = 42

print("Environment successfully prepared.")

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Environment successfully prepared.

# ==========================================================
# 2. LOAD AND MERGE TRAINING DATA
# ==========================================================

RAW_FILES = {
    "new": "new_house_transactions.csv",
    "new_near": "new_house_transactions_nearby_sectors.csv",
    "pre": "pre_owned_house_transactions.csv",
    "pre_near": "pre_owned_house_transactions_nearby_sectors.csv",
    "land": "land_transactions.csv",
    "land_near": "land_transactions_nearby_sectors.csv",
    "poi": "sector_POI.csv",
}

def parse_month_col(df, col="month"):
    """Convert mixed Chinese/English month formats into datetime (YYYY-MM)."""
    if col not in df.columns:
        return df
    df[col] = df[col].astype(str).str.replace(r"_sector\s*\d+$", "", regex=True).str.strip()
    parsed = pd.to_datetime(df[col], errors="coerce")
    mask = parsed.isna() & df[col].str.contains("年", na=False)
    if mask.any():
        def cn_parse(s):
            m = re.match(r"^(\d{4})年(\d{1,2})月", str(s))
            if m:
                return pd.Timestamp(year=int(m.group(1)), month=int(m.group(2)), day=1)
            return pd.NaT
        parsed.loc[mask] = df.loc[mask, col].apply(cn_parse)
    df[col] = parsed.dt.to_period("M").dt.to_timestamp()
    return df

def load_csv(path):
    df = pd.read_csv(path)
    if "month" in df.columns:
        df = parse_month_col(df, "month")
    scols = [c for c in df.columns if c.lower().startswith("sector")]
    if "sector" not in df.columns and scols:
        df = df.rename(columns={scols[0]: "sector"})
    return df

# Load all train tables
dfs = {}
for key, fname in RAW_FILES.items():
    path = os.path.join(DATA_DIR, fname)
    if os.path.exists(path):
        dfs[key] = load_csv(path)
    else:
        print("Missing file:", fname)

# Merge sequentially
base = dfs["new"].copy()
for k in ["new_near", "pre", "pre_near", "land", "land_near"]:
    if k in dfs:
        right = dfs[k].copy()
        dup_cols = [c for c in right.columns if c in base.columns and c not in [TIME_COL, SECTOR_COL]]
        if dup_cols:
            right = right.drop(columns=dup_cols)
        base = base.merge(right, on=[TIME_COL, SECTOR_COL], how="outer")

# Merge static POI info
if "poi" in dfs:
    poi = dfs["poi"].drop_duplicates(subset=[SECTOR_COL])
    dup_cols = [c for c in poi.columns if c in base.columns and c != SECTOR_COL]
    if dup_cols:
        poi = poi.drop(columns=dup_cols)
    base = base.merge(poi, on=SECTOR_COL, how="left")

# Ensure target column exists
if TARGET not in base.columns:
    cand = [c for c in base.columns if "amount_new_house_transaction" in c]
    base[TARGET] = base[cand[0]] if cand else 0.0

base[TARGET] = base[TARGET].fillna(0)
base = base.sort_values([TIME_COL, SECTOR_COL]).reset_index(drop=True)
print("Merged dataset shape:", base.shape)

# Save intermediate file for reproducibility
base.to_csv(OUT_MERGED, index=False)

C:\Users\shuan\AppData\Local\Temp\ipykernel_36304\2008762211.py:20: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.
  parsed = pd.to_datetime(df[col], errors="coerce")
C:\Users\shuan\AppData\Local\Temp\ipykernel_36304\2008762211.py:20: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.
  parsed = pd.to_datetime(df[col], errors="coerce")
C:\Users\shuan\AppData\Local\Temp\ipykernel_36304\2008762211.py:20: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.
  parsed = pd.to_datetime(df[col], errors="coerce")
C:\Users\shuan\AppData\Local\Temp\ipykernel_36304\2008762211.py:20: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.
  parsed = pd.to_datetime(df[col], errors="coerce")
C:\Users\shuan\AppData\Local\Temp\ipykernel_36304\2008762211.py:20: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.
  parsed = pd.to_datetime(df[col], errors="coerce")
C:\Users\shuan\AppData\Local\Temp\ipykernel_36304\2008762211.py:20: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.
  parsed = pd.to_datetime(df[col], errors="coerce")

Merged dataset shape: (6432, 178)

# ==========================================================
# 3. EXPLORATORY DATA ANALYSIS
# ==========================================================
print(f"Observations: {base.shape[0]:,}")
print(f"Features: {base.shape[1]:,}")
print("Date range:", base[TIME_COL].min().date(), "→", base[TIME_COL].max().date())
print("Unique sectors:", base[SECTOR_COL].nunique())

# --- Missing values overview
missing_summary = base.isna().mean().sort_values(ascending=False)
print("\nColumns with >20% missing:")
print(missing_summary[missing_summary > 0.2].head(10))

# --- Target distribution
plt.figure(figsize=(6,4))
sns.histplot(base[TARGET], bins=40, kde=True)
plt.title("Distribution of Target: amount_new_house_transaction")
plt.xlabel("Transaction Amount (×10,000 yuan)")
plt.ylabel("Frequency")
plt.show()

# --- Log-transform distribution (for insight, not yet applied)
plt.figure(figsize=(6,4))
sns.histplot(np.log1p(base[TARGET]), bins=40, kde=True, color='darkorange')
plt.title("Log-Transformed Distribution of Target")
plt.xlabel("log(1 + amount_new_house_transaction)")
plt.show()

# --- Temporal mean
monthly_mean = base.groupby(TIME_COL)[TARGET].mean()
plt.figure(figsize=(8,4))
monthly_mean.plot()
plt.title("Average Monthly Transaction Amount Over Time")
plt.ylabel("Mean (×10,000 yuan)")
plt.xlabel("Month")
plt.show()

Observations: 6,432
Features: 178
Date range: 2019-01-01 → 2024-07-01
Unique sectors: 96

Columns with >20% missing:
num_land_transactions_nearby_sectors    0.21875
construction_area_nearby_sectors        0.21875
planned_building_area_nearby_sectors    0.21875
transaction_amount_nearby_sectors       0.21875
dtype: float64

# ==========================================================
# 4. CREATE LEAKAGE-SAFE LAG FEATURES
# ==========================================================
df = base.copy()

drivers = [
    "num_new_house_transactions_nearby_sectors",
    "area_new_house_transactions_nearby_sectors",
    "price_new_house_transactions_nearby_sectors",
    "amount_new_house_transactions_nearby_sectors",
    "num_pre_owned_house_transactions",
    "area_pre_owned_house_transactions",
    "price_pre_owned_house_transactions",
    "amount_pre_owned_house_transactions",
    "num_land_transactions",
    "planned_building_area",
    "transaction_amount",
]

# generate lags 1–3 months behind for each sector
for col in drivers:
    if col in df.columns:
        for k in [1, 2, 3]:
            df[f"{col}_lag{k}"] = df.groupby(SECTOR_COL)[col].shift(k)

# also create lags of the target variable (self-dependence)
for k in [1, 2, 3]:
    df[f"{TARGET}_lag{k}"] = df.groupby(SECTOR_COL)[TARGET].shift(k)

# remove rows with insufficient historical context
df_lagged = df.dropna().reset_index(drop=True)
print("Lagged dataset shape:", df_lagged.shape)

Lagged dataset shape: (3468, 214)

# ==========================================================
# 5. CHRONOLOGICAL SPLIT 70/15/15 + STANDARDIZATION
# ==========================================================
n = len(df_lagged)
train_end = int(0.7 * n)
valid_end = int(0.85 * n)

train_df = df_lagged.iloc[:train_end]
valid_df = df_lagged.iloc[train_end:valid_end]
test_df  = df_lagged.iloc[valid_end:]

print(f"Train: {train_df.shape},  Valid: {valid_df.shape},  Test: {test_df.shape}")

y_train = train_df[TARGET].values
y_valid = valid_df[TARGET].values
y_test  = test_df[TARGET].values

X_cols = [c for c in train_df.columns if c not in [TARGET, TIME_COL, SECTOR_COL]]
X_train = train_df[X_cols].select_dtypes(include="number")
X_valid = valid_df[X_cols].select_dtypes(include="number")
X_test  = test_df[X_cols].select_dtypes(include="number")

# standardize predictors (mean = 0, sd = 1)
scaler = StandardScaler().fit(X_train)
X_train_s = scaler.transform(X_train)
X_valid_s = scaler.transform(X_valid)
X_test_s  = scaler.transform(X_test)

print("Feature scaling complete.")

Train: (2427, 214),  Valid: (520, 214),  Test: (521, 214)
Feature scaling complete.

# ==========================================================
# 6. LEAKAGE GUARDS + MODEL TRAINING & VALIDATION SELECTION
# ==========================================================

def rmse(y, yhat):
    return math.sqrt(mean_squared_error(y, yhat))

# ------------------------------
# (A) Name-based leakage removal
# ------------------------------
# Drop any column that contains the target name but is not exactly the target
leak_name_cols = [c for c in df_lagged.columns if (TARGET in c) and (c != TARGET)]
print(f"[Leak guard] Dropping by name ({len(leak_name_cols)} columns):")
print(sorted(leak_name_cols)[:8], "..." if len(leak_name_cols) > 8 else "")

# Build a safe working copy without name-leak features
df_safe = df_lagged.drop(columns=leak_name_cols)

# Re-slice the same chronological splits with the safe frame
train_idx = train_df.index
valid_idx = valid_df.index
test_idx  = test_df.index

train_df_safe = df_safe.loc[train_idx]
valid_df_safe = df_safe.loc[valid_idx]
test_df_safe  = df_safe.loc[test_idx]

y_train = train_df_safe[TARGET].values
y_valid = valid_df_safe[TARGET].values
y_test  = test_df_safe[TARGET].values

# Candidate X columns: numeric only, excluding keys/target
exclude_cols = {TARGET, TIME_COL, SECTOR_COL}
X_cols_all = [c for c in df_safe.columns if c not in exclude_cols]
X_cols_all = [c for c in X_cols_all if pd.api.types.is_numeric_dtype(df_safe[c])]

X_train_raw = train_df_safe[X_cols_all]
X_valid_raw = valid_df_safe[X_cols_all]
X_test_raw  = test_df_safe[X_cols_all]

# -------------------------------------------------------
# (B) Data-based guard (train-only high-corr feature drop)
# -------------------------------------------------------
corr_with_y = X_train_raw.corrwith(train_df_safe[TARGET])
hi_corr = corr_with_y[abs(corr_with_y) > 0.995].index.tolist()
if hi_corr:
    print(f"[Leak guard] Dropping by high correlation in train ({len(hi_corr)}):")
    print(sorted(hi_corr)[:8], "..." if len(hi_corr) > 8 else "")
else:
    print("[Leak guard] No features exceeded |corr| > 0.995 in training.")

X_cols = [c for c in X_cols_all if c not in hi_corr]

# Final feature matrices (after both guards)
X_train = X_train_raw[X_cols].copy()
X_valid = X_valid_raw[X_cols].copy()
X_test  = X_test_raw[X_cols].copy()

# Standardize (fit on train only)
scaler = StandardScaler().fit(X_train)
X_train_s = scaler.transform(X_train)
X_valid_s = scaler.transform(X_valid)
X_test_s  = scaler.transform(X_test)

print(f"Final feature count after guards: {X_train.shape[1]}")

# ------------------------------
# (C) Train models on TRAIN only
# ------------------------------
val_results = []

# OLS
ols = LinearRegression().fit(X_train_s, y_train)
pred_ols = ols.predict(X_valid_s)
val_results.append([
    "OLS",
    rmse(y_valid, pred_ols),
    mean_absolute_error(y_valid, pred_ols),
    r2_score(y_valid, pred_ols),
    "None"
])

# Ridge
alphas = np.logspace(-3, 3, 13)
ridge = GridSearchCV(Ridge(), {"alpha": alphas},
                     scoring="neg_root_mean_squared_error", cv=3)
ridge.fit(X_train_s, y_train)
pred_ridge = ridge.best_estimator_.predict(X_valid_s)
val_results.append([
    "Ridge",
    rmse(y_valid, pred_ridge),
    mean_absolute_error(y_valid, pred_ridge),
    r2_score(y_valid, pred_ridge),
    f"alpha={ridge.best_params_['alpha']}"
])

# LASSO
lasso = GridSearchCV(Lasso(max_iter=10000), {"alpha": alphas},
                     scoring="neg_root_mean_squared_error", cv=3)
lasso.fit(X_train_s, y_train)
pred_lasso = lasso.best_estimator_.predict(X_valid_s)
val_results.append([
    "LASSO",
    rmse(y_valid, pred_lasso),
    mean_absolute_error(y_valid, pred_lasso),
    r2_score(y_valid, pred_lasso),
    f"alpha={lasso.best_params_['alpha']}"
])

# kNN (non-parametric baseline)
best_rm, best_k, best_pred = float("inf"), None, None
for k in [3, 5, 7, 9, 11, 15]:
    knn = KNeighborsRegressor(n_neighbors=k, weights="distance").fit(X_train_s, y_train)
    p = knn.predict(X_valid_s)
    r = rmse(y_valid, p)
    if r < best_rm:
        best_rm, best_k, best_pred = r, k, p

val_results.append([
    "kNN",
    best_rm,
    mean_absolute_error(y_valid, best_pred),
    r2_score(y_valid, best_pred),
    f"k={best_k}"
])

# Validation summary table
tbl = (pd.DataFrame(val_results, columns=["Model","RMSE_valid","MAE_valid","R2_valid","Notes"])
         .sort_values("RMSE_valid")
         .reset_index(drop=True))
display(tbl)

[Leak guard] Dropping by name (8 columns):
['amount_new_house_transaction_lag1', 'amount_new_house_transaction_lag2', 'amount_new_house_transaction_lag3', 'amount_new_house_transactions', 'amount_new_house_transactions_nearby_sectors', 'amount_new_house_transactions_nearby_sectors_lag1', 'amount_new_house_transactions_nearby_sectors_lag2', 'amount_new_house_transactions_nearby_sectors_lag3'] 
[Leak guard] No features exceeded |corr| > 0.995 in training.
Final feature count after guards: 203

c:\Users\shuan\AppData\Local\Programs\Python\Python313\Lib\site-packages\numpy\lib\_function_base_impl.py:3065: RuntimeWarning: invalid value encountered in divide
  c /= stddev[:, None]
c:\Users\shuan\AppData\Local\Programs\Python\Python313\Lib\site-packages\numpy\lib\_function_base_impl.py:3066: RuntimeWarning: invalid value encountered in divide
  c /= stddev[None, :]
c:\Users\shuan\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\linear_model\_coordinate_descent.py:695: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 2.348e+11, tolerance: 5.022e+08
  model = cd_fast.enet_coordinate_descent(
c:\Users\shuan\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\linear_model\_coordinate_descent.py:695: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 1.457e+11, tolerance: 3.237e+08
  model = cd_fast.enet_coordinate_descent(
c:\Users\shuan\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\linear_model\_coordinate_descent.py:695: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 1.786e+11, tolerance: 4.270e+08
  model = cd_fast.enet_coordinate_descent(
c:\Users\shuan\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\linear_model\_coordinate_descent.py:695: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 2.347e+11, tolerance: 5.022e+08
  model = cd_fast.enet_coordinate_descent(
c:\Users\shuan\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\linear_model\_coordinate_descent.py:695: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 1.456e+11, tolerance: 3.237e+08
  model = cd_fast.enet_coordinate_descent(
c:\Users\shuan\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\linear_model\_coordinate_descent.py:695: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 1.785e+11, tolerance: 4.270e+08
  model = cd_fast.enet_coordinate_descent(
c:\Users\shuan\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\linear_model\_coordinate_descent.py:695: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 2.344e+11, tolerance: 5.022e+08
  model = cd_fast.enet_coordinate_descent(
c:\Users\shuan\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\linear_model\_coordinate_descent.py:695: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 1.453e+11, tolerance: 3.237e+08
  model = cd_fast.enet_coordinate_descent(
c:\Users\shuan\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\linear_model\_coordinate_descent.py:695: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 1.783e+11, tolerance: 4.270e+08
  model = cd_fast.enet_coordinate_descent(
c:\Users\shuan\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\linear_model\_coordinate_descent.py:695: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 2.334e+11, tolerance: 5.022e+08
  model = cd_fast.enet_coordinate_descent(
c:\Users\shuan\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\linear_model\_coordinate_descent.py:695: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 1.442e+11, tolerance: 3.237e+08
  model = cd_fast.enet_coordinate_descent(
c:\Users\shuan\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\linear_model\_coordinate_descent.py:695: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 1.774e+11, tolerance: 4.270e+08
  model = cd_fast.enet_coordinate_descent(
c:\Users\shuan\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\linear_model\_coordinate_descent.py:695: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 2.303e+11, tolerance: 5.022e+08
  model = cd_fast.enet_coordinate_descent(
c:\Users\shuan\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\linear_model\_coordinate_descent.py:695: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 1.409e+11, tolerance: 3.237e+08
  model = cd_fast.enet_coordinate_descent(
c:\Users\shuan\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\linear_model\_coordinate_descent.py:695: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 1.748e+11, tolerance: 4.270e+08
  model = cd_fast.enet_coordinate_descent(
c:\Users\shuan\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\linear_model\_coordinate_descent.py:695: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 2.202e+11, tolerance: 5.022e+08
  model = cd_fast.enet_coordinate_descent(
c:\Users\shuan\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\linear_model\_coordinate_descent.py:695: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 1.303e+11, tolerance: 3.237e+08
  model = cd_fast.enet_coordinate_descent(
c:\Users\shuan\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\linear_model\_coordinate_descent.py:695: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 1.662e+11, tolerance: 4.270e+08
  model = cd_fast.enet_coordinate_descent(
c:\Users\shuan\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\linear_model\_coordinate_descent.py:695: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 1.867e+11, tolerance: 5.022e+08
  model = cd_fast.enet_coordinate_descent(
c:\Users\shuan\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\linear_model\_coordinate_descent.py:695: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 9.571e+10, tolerance: 3.237e+08
  model = cd_fast.enet_coordinate_descent(
c:\Users\shuan\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\linear_model\_coordinate_descent.py:695: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 1.329e+11, tolerance: 4.270e+08
  model = cd_fast.enet_coordinate_descent(
c:\Users\shuan\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\linear_model\_coordinate_descent.py:695: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 8.845e+10, tolerance: 5.022e+08
  model = cd_fast.enet_coordinate_descent(
c:\Users\shuan\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\linear_model\_coordinate_descent.py:695: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 3.870e+10, tolerance: 3.237e+08
  model = cd_fast.enet_coordinate_descent(
c:\Users\shuan\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\linear_model\_coordinate_descent.py:695: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 5.020e+10, tolerance: 4.270e+08
  model = cd_fast.enet_coordinate_descent(
c:\Users\shuan\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\linear_model\_coordinate_descent.py:695: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 7.585e+08, tolerance: 5.022e+08
  model = cd_fast.enet_coordinate_descent(
c:\Users\shuan\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\linear_model\_coordinate_descent.py:695: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 1.749e+09, tolerance: 3.237e+08
  model = cd_fast.enet_coordinate_descent(
c:\Users\shuan\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\linear_model\_coordinate_descent.py:695: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 8.422e+08, tolerance: 4.270e+08
  model = cd_fast.enet_coordinate_descent(
c:\Users\shuan\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\linear_model\_coordinate_descent.py:695: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 3.112e+11, tolerance: 6.302e+08
  model = cd_fast.enet_coordinate_descent(

# ==========================================================
# 7. FINAL RETRAIN AND TEST EVALUATION
# ==========================================================
best = tbl.iloc[0]
best_model_name = best["Model"]
print("Selected model based on validation RMSE:", best_model_name)

# Combine train + validation
X_trval = np.vstack([X_train_s, X_valid_s])
y_trval = np.concatenate([y_train, y_valid])

# Retrain the best model
if best_model_name == "OLS":
    final_model = LinearRegression().fit(X_trval, y_trval)
elif best_model_name == "Ridge":
    alpha_val = float(best["Notes"].split("=")[-1])
    final_model = Ridge(alpha=alpha_val).fit(X_trval, y_trval)
elif best_model_name == "LASSO":
    alpha_val = float(best["Notes"].split("=")[-1])
    final_model = Lasso(alpha=alpha_val, max_iter=10000).fit(X_trval, y_trval)
elif best_model_name == "kNN":
    k_val = int(best["Notes"].split("=")[-1])
    final_model = KNeighborsRegressor(n_neighbors=k_val, weights="distance").fit(X_trval, y_trval)
else:
    raise ValueError("Unexpected model selection.")

# Evaluate on test set
pred_test = final_model.predict(X_test_s)
rmse_test = math.sqrt(mean_squared_error(y_test, pred_test))
mae_test  = mean_absolute_error(y_test, pred_test)
r2_test   = r2_score(y_test, pred_test)

print(f"Test RMSE: {rmse_test:.2f}")
print(f"Test MAE:  {mae_test:.2f}")
print(f"Test R²:   {r2_test:.3f}")

# Scatter plot for diagnostic visualization
plt.figure(figsize=(6,5))
plt.scatter(y_test, pred_test, alpha=0.5)
plt.plot([y_test.min(), y_test.max()],
         [y_test.min(), y_test.max()],
         color="red", linestyle="--")
plt.xlabel("Actual Transaction Amount (×10,000 yuan)")
plt.ylabel("Predicted Transaction Amount")
plt.title(f"Test Set Prediction vs Actual ({best_model_name})")
plt.show()

Selected model based on validation RMSE: OLS
Test RMSE: 15767.52
Test MAE:  9534.94
Test R²:   0.777

# ==========================================================
# 1. ENVIRONMENT SETUP
# ==========================================================
# This cell ensures all dependencies are installed correctly even on a fresh system.

%pip install --quiet --upgrade pip
%pip install --quiet pandas numpy matplotlib scikit-learn seaborn jupyterlab notebook

import os, re, math, numpy as np, pandas as pd, matplotlib.pyplot as plt, seaborn as sns
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsRegressor

pd.set_option("display.max_columns", 120)
sns.set_style("whitegrid")

# Project directory (edit if different)
DATA_DIR = "./train"
OUT_MERGED = "./merged_train.csv"
TARGET = "amount_new_house_transaction"
TIME_COL = "month"
SECTOR_COL = "sector"
RANDOM_STATE = 42

print("Environment successfully prepared.")

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Environment successfully prepared.

# ==========================================================
# 2. LOAD AND MERGE TRAINING DATA
# ==========================================================

RAW_FILES = {
    "new": "new_house_transactions.csv",
    "new_near": "new_house_transactions_nearby_sectors.csv",
    "pre": "pre_owned_house_transactions.csv",
    "pre_near": "pre_owned_house_transactions_nearby_sectors.csv",
    "land": "land_transactions.csv",
    "land_near": "land_transactions_nearby_sectors.csv",
    "poi": "sector_POI.csv",
}

def parse_month_col(df, col="month"):
    """Convert mixed Chinese/English month formats into datetime (YYYY-MM)."""
    if col not in df.columns:
        return df
    df[col] = df[col].astype(str).str.replace(r"_sector\s*\d+$", "", regex=True).str.strip()
    parsed = pd.to_datetime(df[col], errors="coerce")
    mask = parsed.isna() & df[col].str.contains("年", na=False)
    if mask.any():
        def cn_parse(s):
            m = re.match(r"^(\d{4})年(\d{1,2})月", str(s))
            if m:
                return pd.Timestamp(year=int(m.group(1)), month=int(m.group(2)), day=1)
            return pd.NaT
        parsed.loc[mask] = df.loc[mask, col].apply(cn_parse)
    df[col] = parsed.dt.to_period("M").dt.to_timestamp()
    return df

def load_csv(path):
    df = pd.read_csv(path)
    if "month" in df.columns:
        df = parse_month_col(df, "month")
    scols = [c for c in df.columns if c.lower().startswith("sector")]
    if "sector" not in df.columns and scols:
        df = df.rename(columns={scols[0]: "sector"})
    return df

# Load all train tables
dfs = {}
for key, fname in RAW_FILES.items():
    path = os.path.join(DATA_DIR, fname)
    if os.path.exists(path):
        dfs[key] = load_csv(path)
    else:
        print("Missing file:", fname)

# Merge sequentially
base = dfs["new"].copy()
for k in ["new_near", "pre", "pre_near", "land", "land_near"]:
    if k in dfs:
        right = dfs[k].copy()
        dup_cols = [c for c in right.columns if c in base.columns and c not in [TIME_COL, SECTOR_COL]]
        if dup_cols:
            right = right.drop(columns=dup_cols)
        base = base.merge(right, on=[TIME_COL, SECTOR_COL], how="outer")

# Merge static POI info
if "poi" in dfs:
    poi = dfs["poi"].drop_duplicates(subset=[SECTOR_COL])
    dup_cols = [c for c in poi.columns if c in base.columns and c != SECTOR_COL]
    if dup_cols:
        poi = poi.drop(columns=dup_cols)
    base = base.merge(poi, on=SECTOR_COL, how="left")

# Ensure target column exists
if TARGET not in base.columns:
    cand = [c for c in base.columns if "amount_new_house_transaction" in c]
    base[TARGET] = base[cand[0]] if cand else 0.0

base[TARGET] = base[TARGET].fillna(0)
base = base.sort_values([TIME_COL, SECTOR_COL]).reset_index(drop=True)
print("Merged dataset shape:", base.shape)

# Save intermediate file for reproducibility
base.to_csv(OUT_MERGED, index=False)

C:\Users\shuan\AppData\Local\Temp\ipykernel_36304\2008762211.py:20: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.
  parsed = pd.to_datetime(df[col], errors="coerce")
C:\Users\shuan\AppData\Local\Temp\ipykernel_36304\2008762211.py:20: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.
  parsed = pd.to_datetime(df[col], errors="coerce")
C:\Users\shuan\AppData\Local\Temp\ipykernel_36304\2008762211.py:20: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.
  parsed = pd.to_datetime(df[col], errors="coerce")
C:\Users\shuan\AppData\Local\Temp\ipykernel_36304\2008762211.py:20: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.
  parsed = pd.to_datetime(df[col], errors="coerce")
C:\Users\shuan\AppData\Local\Temp\ipykernel_36304\2008762211.py:20: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.
  parsed = pd.to_datetime(df[col], errors="coerce")
C:\Users\shuan\AppData\Local\Temp\ipykernel_36304\2008762211.py:20: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.
  parsed = pd.to_datetime(df[col], errors="coerce")

Merged dataset shape: (6432, 178)

# ==========================================================
# 3. EXPLORATORY DATA ANALYSIS
# ==========================================================
print(f"Observations: {base.shape[0]:,}")
print(f"Features: {base.shape[1]:,}")
print("Date range:", base[TIME_COL].min().date(), "→", base[TIME_COL].max().date())
print("Unique sectors:", base[SECTOR_COL].nunique())

# --- Missing values overview
missing_summary = base.isna().mean().sort_values(ascending=False)
print("\nColumns with >20% missing:")
print(missing_summary[missing_summary > 0.2].head(10))

# --- Target distribution
plt.figure(figsize=(6,4))
sns.histplot(base[TARGET], bins=40, kde=True)
plt.title("Distribution of Target: amount_new_house_transaction")
plt.xlabel("Transaction Amount (×10,000 yuan)")
plt.ylabel("Frequency")
plt.show()

# --- Log-transform distribution (for insight, not yet applied)
plt.figure(figsize=(6,4))
sns.histplot(np.log1p(base[TARGET]), bins=40, kde=True, color='darkorange')
plt.title("Log-Transformed Distribution of Target")
plt.xlabel("log(1 + amount_new_house_transaction)")
plt.show()

# --- Temporal mean
monthly_mean = base.groupby(TIME_COL)[TARGET].mean()
plt.figure(figsize=(8,4))
monthly_mean.plot()
plt.title("Average Monthly Transaction Amount Over Time")
plt.ylabel("Mean (×10,000 yuan)")
plt.xlabel("Month")
plt.show()

Observations: 6,432
Features: 178
Date range: 2019-01-01 → 2024-07-01
Unique sectors: 96

Columns with >20% missing:
num_land_transactions_nearby_sectors    0.21875
construction_area_nearby_sectors        0.21875
planned_building_area_nearby_sectors    0.21875
transaction_amount_nearby_sectors       0.21875
dtype: float64

# ==========================================================
# 4. CREATE LEAKAGE-SAFE LAG FEATURES
# ==========================================================
df = base.copy()

drivers = [
    "num_new_house_transactions_nearby_sectors",
    "area_new_house_transactions_nearby_sectors",
    "price_new_house_transactions_nearby_sectors",
    "amount_new_house_transactions_nearby_sectors",
    "num_pre_owned_house_transactions",
    "area_pre_owned_house_transactions",
    "price_pre_owned_house_transactions",
    "amount_pre_owned_house_transactions",
    "num_land_transactions",
    "planned_building_area",
    "transaction_amount",
]

# generate lags 1–3 months behind for each sector
for col in drivers:
    if col in df.columns:
        for k in [1, 2, 3]:
            df[f"{col}_lag{k}"] = df.groupby(SECTOR_COL)[col].shift(k)

# also create lags of the target variable (self-dependence)
for k in [1, 2, 3]:
    df[f"{TARGET}_lag{k}"] = df.groupby(SECTOR_COL)[TARGET].shift(k)

# remove rows with insufficient historical context
df_lagged = df.dropna().reset_index(drop=True)
print("Lagged dataset shape:", df_lagged.shape)

Lagged dataset shape: (3468, 214)

# ==========================================================
# 5. CHRONOLOGICAL SPLIT 70/15/15 + STANDARDIZATION
# ==========================================================
n = len(df_lagged)
train_end = int(0.7 * n)
valid_end = int(0.85 * n)

train_df = df_lagged.iloc[:train_end]
valid_df = df_lagged.iloc[train_end:valid_end]
test_df  = df_lagged.iloc[valid_end:]

print(f"Train: {train_df.shape},  Valid: {valid_df.shape},  Test: {test_df.shape}")

y_train = train_df[TARGET].values
y_valid = valid_df[TARGET].values
y_test  = test_df[TARGET].values

X_cols = [c for c in train_df.columns if c not in [TARGET, TIME_COL, SECTOR_COL]]
X_train = train_df[X_cols].select_dtypes(include="number")
X_valid = valid_df[X_cols].select_dtypes(include="number")
X_test  = test_df[X_cols].select_dtypes(include="number")

# standardize predictors (mean = 0, sd = 1)
scaler = StandardScaler().fit(X_train)
X_train_s = scaler.transform(X_train)
X_valid_s = scaler.transform(X_valid)
X_test_s  = scaler.transform(X_test)

print("Feature scaling complete.")

Train: (2427, 214),  Valid: (520, 214),  Test: (521, 214)
Feature scaling complete.

# ==========================================================
# 6. LEAKAGE GUARDS + MODEL TRAINING & VALIDATION SELECTION
# ==========================================================

def rmse(y, yhat):
    return math.sqrt(mean_squared_error(y, yhat))

# ------------------------------
# (A) Name-based leakage removal
# ------------------------------
# Drop any column that contains the target name but is not exactly the target
leak_name_cols = [c for c in df_lagged.columns if (TARGET in c) and (c != TARGET)]
print(f"[Leak guard] Dropping by name ({len(leak_name_cols)} columns):")
print(sorted(leak_name_cols)[:8], "..." if len(leak_name_cols) > 8 else "")

# Build a safe working copy without name-leak features
df_safe = df_lagged.drop(columns=leak_name_cols)

# Re-slice the same chronological splits with the safe frame
train_idx = train_df.index
valid_idx = valid_df.index
test_idx  = test_df.index

train_df_safe = df_safe.loc[train_idx]
valid_df_safe = df_safe.loc[valid_idx]
test_df_safe  = df_safe.loc[test_idx]

y_train = train_df_safe[TARGET].values
y_valid = valid_df_safe[TARGET].values
y_test  = test_df_safe[TARGET].values

# Candidate X columns: numeric only, excluding keys/target
exclude_cols = {TARGET, TIME_COL, SECTOR_COL}
X_cols_all = [c for c in df_safe.columns if c not in exclude_cols]
X_cols_all = [c for c in X_cols_all if pd.api.types.is_numeric_dtype(df_safe[c])]

X_train_raw = train_df_safe[X_cols_all]
X_valid_raw = valid_df_safe[X_cols_all]
X_test_raw  = test_df_safe[X_cols_all]

# -------------------------------------------------------
# (B) Data-based guard (train-only high-corr feature drop)
# -------------------------------------------------------
corr_with_y = X_train_raw.corrwith(train_df_safe[TARGET])
hi_corr = corr_with_y[abs(corr_with_y) > 0.995].index.tolist()
if hi_corr:
    print(f"[Leak guard] Dropping by high correlation in train ({len(hi_corr)}):")
    print(sorted(hi_corr)[:8], "..." if len(hi_corr) > 8 else "")
else:
    print("[Leak guard] No features exceeded |corr| > 0.995 in training.")

X_cols = [c for c in X_cols_all if c not in hi_corr]

# Final feature matrices (after both guards)
X_train = X_train_raw[X_cols].copy()
X_valid = X_valid_raw[X_cols].copy()
X_test  = X_test_raw[X_cols].copy()

# Standardize (fit on train only)
scaler = StandardScaler().fit(X_train)
X_train_s = scaler.transform(X_train)
X_valid_s = scaler.transform(X_valid)
X_test_s  = scaler.transform(X_test)

print(f"Final feature count after guards: {X_train.shape[1]}")

# ------------------------------
# (C) Train models on TRAIN only
# ------------------------------
val_results = []

# OLS
ols = LinearRegression().fit(X_train_s, y_train)
pred_ols = ols.predict(X_valid_s)
val_results.append([
    "OLS",
    rmse(y_valid, pred_ols),
    mean_absolute_error(y_valid, pred_ols),
    r2_score(y_valid, pred_ols),
    "None"
])

# Ridge
alphas = np.logspace(-3, 3, 13)
ridge = GridSearchCV(Ridge(), {"alpha": alphas},
                     scoring="neg_root_mean_squared_error", cv=3)
ridge.fit(X_train_s, y_train)
pred_ridge = ridge.best_estimator_.predict(X_valid_s)
val_results.append([
    "Ridge",
    rmse(y_valid, pred_ridge),
    mean_absolute_error(y_valid, pred_ridge),
    r2_score(y_valid, pred_ridge),
    f"alpha={ridge.best_params_['alpha']}"
])

# LASSO
lasso = GridSearchCV(Lasso(max_iter=10000), {"alpha": alphas},
                     scoring="neg_root_mean_squared_error", cv=3)
lasso.fit(X_train_s, y_train)
pred_lasso = lasso.best_estimator_.predict(X_valid_s)
val_results.append([
    "LASSO",
    rmse(y_valid, pred_lasso),
    mean_absolute_error(y_valid, pred_lasso),
    r2_score(y_valid, pred_lasso),
    f"alpha={lasso.best_params_['alpha']}"
])

# kNN (non-parametric baseline)
best_rm, best_k, best_pred = float("inf"), None, None
for k in [3, 5, 7, 9, 11, 15]:
    knn = KNeighborsRegressor(n_neighbors=k, weights="distance").fit(X_train_s, y_train)
    p = knn.predict(X_valid_s)
    r = rmse(y_valid, p)
    if r < best_rm:
        best_rm, best_k, best_pred = r, k, p

val_results.append([
    "kNN",
    best_rm,
    mean_absolute_error(y_valid, best_pred),
    r2_score(y_valid, best_pred),
    f"k={best_k}"
])

# Validation summary table
tbl = (pd.DataFrame(val_results, columns=["Model","RMSE_valid","MAE_valid","R2_valid","Notes"])
         .sort_values("RMSE_valid")
         .reset_index(drop=True))
display(tbl)

[Leak guard] Dropping by name (8 columns):
['amount_new_house_transaction_lag1', 'amount_new_house_transaction_lag2', 'amount_new_house_transaction_lag3', 'amount_new_house_transactions', 'amount_new_house_transactions_nearby_sectors', 'amount_new_house_transactions_nearby_sectors_lag1', 'amount_new_house_transactions_nearby_sectors_lag2', 'amount_new_house_transactions_nearby_sectors_lag3'] 
[Leak guard] No features exceeded |corr| > 0.995 in training.
Final feature count after guards: 203

c:\Users\shuan\AppData\Local\Programs\Python\Python313\Lib\site-packages\numpy\lib\_function_base_impl.py:3065: RuntimeWarning: invalid value encountered in divide
  c /= stddev[:, None]
c:\Users\shuan\AppData\Local\Programs\Python\Python313\Lib\site-packages\numpy\lib\_function_base_impl.py:3066: RuntimeWarning: invalid value encountered in divide
  c /= stddev[None, :]
c:\Users\shuan\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\linear_model\_coordinate_descent.py:695: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 2.348e+11, tolerance: 5.022e+08
  model = cd_fast.enet_coordinate_descent(
c:\Users\shuan\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\linear_model\_coordinate_descent.py:695: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 1.457e+11, tolerance: 3.237e+08
  model = cd_fast.enet_coordinate_descent(
c:\Users\shuan\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\linear_model\_coordinate_descent.py:695: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 1.786e+11, tolerance: 4.270e+08
  model = cd_fast.enet_coordinate_descent(
c:\Users\shuan\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\linear_model\_coordinate_descent.py:695: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 2.347e+11, tolerance: 5.022e+08
  model = cd_fast.enet_coordinate_descent(
c:\Users\shuan\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\linear_model\_coordinate_descent.py:695: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 1.456e+11, tolerance: 3.237e+08
  model = cd_fast.enet_coordinate_descent(
c:\Users\shuan\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\linear_model\_coordinate_descent.py:695: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 1.785e+11, tolerance: 4.270e+08
  model = cd_fast.enet_coordinate_descent(
c:\Users\shuan\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\linear_model\_coordinate_descent.py:695: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 2.344e+11, tolerance: 5.022e+08
  model = cd_fast.enet_coordinate_descent(
c:\Users\shuan\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\linear_model\_coordinate_descent.py:695: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 1.453e+11, tolerance: 3.237e+08
  model = cd_fast.enet_coordinate_descent(
c:\Users\shuan\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\linear_model\_coordinate_descent.py:695: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 1.783e+11, tolerance: 4.270e+08
  model = cd_fast.enet_coordinate_descent(
c:\Users\shuan\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\linear_model\_coordinate_descent.py:695: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 2.334e+11, tolerance: 5.022e+08
  model = cd_fast.enet_coordinate_descent(
c:\Users\shuan\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\linear_model\_coordinate_descent.py:695: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 1.442e+11, tolerance: 3.237e+08
  model = cd_fast.enet_coordinate_descent(
c:\Users\shuan\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\linear_model\_coordinate_descent.py:695: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 1.774e+11, tolerance: 4.270e+08
  model = cd_fast.enet_coordinate_descent(
c:\Users\shuan\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\linear_model\_coordinate_descent.py:695: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 2.303e+11, tolerance: 5.022e+08
  model = cd_fast.enet_coordinate_descent(
c:\Users\shuan\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\linear_model\_coordinate_descent.py:695: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 1.409e+11, tolerance: 3.237e+08
  model = cd_fast.enet_coordinate_descent(
c:\Users\shuan\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\linear_model\_coordinate_descent.py:695: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 1.748e+11, tolerance: 4.270e+08
  model = cd_fast.enet_coordinate_descent(
c:\Users\shuan\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\linear_model\_coordinate_descent.py:695: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 2.202e+11, tolerance: 5.022e+08
  model = cd_fast.enet_coordinate_descent(
c:\Users\shuan\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\linear_model\_coordinate_descent.py:695: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 1.303e+11, tolerance: 3.237e+08
  model = cd_fast.enet_coordinate_descent(
c:\Users\shuan\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\linear_model\_coordinate_descent.py:695: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 1.662e+11, tolerance: 4.270e+08
  model = cd_fast.enet_coordinate_descent(
c:\Users\shuan\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\linear_model\_coordinate_descent.py:695: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 1.867e+11, tolerance: 5.022e+08
  model = cd_fast.enet_coordinate_descent(
c:\Users\shuan\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\linear_model\_coordinate_descent.py:695: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 9.571e+10, tolerance: 3.237e+08
  model = cd_fast.enet_coordinate_descent(
c:\Users\shuan\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\linear_model\_coordinate_descent.py:695: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 1.329e+11, tolerance: 4.270e+08
  model = cd_fast.enet_coordinate_descent(
c:\Users\shuan\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\linear_model\_coordinate_descent.py:695: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 8.845e+10, tolerance: 5.022e+08
  model = cd_fast.enet_coordinate_descent(
c:\Users\shuan\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\linear_model\_coordinate_descent.py:695: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 3.870e+10, tolerance: 3.237e+08
  model = cd_fast.enet_coordinate_descent(
c:\Users\shuan\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\linear_model\_coordinate_descent.py:695: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 5.020e+10, tolerance: 4.270e+08
  model = cd_fast.enet_coordinate_descent(
c:\Users\shuan\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\linear_model\_coordinate_descent.py:695: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 7.585e+08, tolerance: 5.022e+08
  model = cd_fast.enet_coordinate_descent(
c:\Users\shuan\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\linear_model\_coordinate_descent.py:695: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 1.749e+09, tolerance: 3.237e+08
  model = cd_fast.enet_coordinate_descent(
c:\Users\shuan\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\linear_model\_coordinate_descent.py:695: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 8.422e+08, tolerance: 4.270e+08
  model = cd_fast.enet_coordinate_descent(
c:\Users\shuan\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\linear_model\_coordinate_descent.py:695: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 3.112e+11, tolerance: 6.302e+08
  model = cd_fast.enet_coordinate_descent(

# ==========================================================
# 7. FINAL RETRAIN AND TEST EVALUATION
# ==========================================================
best = tbl.iloc[0]
best_model_name = best["Model"]
print("Selected model based on validation RMSE:", best_model_name)

# Combine train + validation
X_trval = np.vstack([X_train_s, X_valid_s])
y_trval = np.concatenate([y_train, y_valid])

# Retrain the best model
if best_model_name == "OLS":
    final_model = LinearRegression().fit(X_trval, y_trval)
elif best_model_name == "Ridge":
    alpha_val = float(best["Notes"].split("=")[-1])
    final_model = Ridge(alpha=alpha_val).fit(X_trval, y_trval)
elif best_model_name == "LASSO":
    alpha_val = float(best["Notes"].split("=")[-1])
    final_model = Lasso(alpha=alpha_val, max_iter=10000).fit(X_trval, y_trval)
elif best_model_name == "kNN":
    k_val = int(best["Notes"].split("=")[-1])
    final_model = KNeighborsRegressor(n_neighbors=k_val, weights="distance").fit(X_trval, y_trval)
else:
    raise ValueError("Unexpected model selection.")

# Evaluate on test set
pred_test = final_model.predict(X_test_s)
rmse_test = math.sqrt(mean_squared_error(y_test, pred_test))
mae_test  = mean_absolute_error(y_test, pred_test)
r2_test   = r2_score(y_test, pred_test)

print(f"Test RMSE: {rmse_test:.2f}")
print(f"Test MAE:  {mae_test:.2f}")
print(f"Test R²:   {r2_test:.3f}")

# Scatter plot for diagnostic visualization
plt.figure(figsize=(6,5))
plt.scatter(y_test, pred_test, alpha=0.5)
plt.plot([y_test.min(), y_test.max()],
         [y_test.min(), y_test.max()],
         color="red", linestyle="--")
plt.xlabel("Actual Transaction Amount (×10,000 yuan)")
plt.ylabel("Predicted Transaction Amount")
plt.title(f"Test Set Prediction vs Actual ({best_model_name})")
plt.show()

Selected model based on validation RMSE: OLS
Test RMSE: 15767.52
Test MAE:  9534.94
Test R²:   0.777

File	Description
`new_house_transactions.csv`	Core table — monthly new-house transaction statistics (target variable included).
`*_nearby_sectors.csv`	Aggregated indicators of neighboring areas (to capture spatial effects).
`pre_owned_house_transactions.csv`	Market activity for pre-owned homes.
`land_transactions.csv`	Land-sale metrics influencing housing supply.
`sector_POI.csv`	Static geographic and demographic information for each sector.

Subset	Purpose	Notes
Training (70 %)	Fit model parameters.	Used in cross-validation and hyper-parameter tuning.
Validation (15 %)	Model comparison & selection.	Guides choice among OLS, Ridge, LASSO, and kNN.
Test (15 %)	Final unbiased evaluation.	Held out until model selection is complete.

Model	Type	Key Properties	Intended Role
Ordinary Least Squares (OLS)	Linear, unregularized	Baseline parametric estimator; interpretable coefficients.	Establishes benchmark performance and interpretability.
Ridge Regression	Linear, L2-regularized	Shrinks correlated coefficients; reduces variance.	Handles multicollinearity in overlapping economic predictors.
LASSO Regression	Linear, L1-regularized	Performs feature selection; enforces sparsity.	Identifies most influential predictors among many.
k-Nearest Neighbors (kNN)	Non-parametric	Uses local averaging; captures mild non-linearity.	Serves as contrast to linear models and baseline non-parametric check.

Metric	Meaning	Desirable Direction
RMSE	Root Mean Squared Error; penalizes large deviations.	↓ lower
MAE	Mean Absolute Error; measures median accuracy.	↓ lower
R²	Coefficient of Determination; proportion of variance explained.	↑ higher

	Model	RMSE_valid	MAE_valid	R2_valid	Notes
0	OLS	16328.312033	9452.142007	0.830851	None
1	Ridge	16331.557753	9448.050170	0.830784	alpha=0.001
2	LASSO	16758.636044	9502.708410	0.821818	alpha=0.001
3	kNN	28713.247947	16430.831038	0.476939	k=3

China Real Estate Demand Prediction¶

Final Project: Model Selection and Evaluation in Statistical Learning¶

China Real Estate Demand Prediction¶

Dataset Selection and Description¶

2 Data Understanding and Merging¶

Notes on Data Preprocessing¶

3 Exploratory Data Analysis (EDA)¶

Observations¶

4 Leakage-Safe Lag Feature Engineering¶

5 Chronological Train / Validation / Test Split¶

Discussion¶

6 Model Rationale, Leakage Prevention, and Validation-Based Selection¶

Leakage prevention (critical)¶

Interpretation of Validation Comparison¶

7 Final Model Retraining and Test Evaluation¶

8 Conclusions and Discussion¶

Model Comparison Summary¶

Reasons for Selecting OLS Regression¶

Interpretation and Insights¶

Limitations and Future Work¶

Educational Reflection¶

9 References and Acknowledgments¶

China Real Estate Demand Prediction¶

Final Project: Model Selection and Evaluation in Statistical Learning¶

China Real Estate Demand Prediction¶

Dataset Selection and Description¶

2 Data Understanding and Merging¶

Notes on Data Preprocessing¶

3 Exploratory Data Analysis (EDA)¶

Observations¶

4 Leakage-Safe Lag Feature Engineering¶

5 Chronological Train / Validation / Test Split¶

Discussion¶

6 Model Rationale, Leakage Prevention, and Validation-Based Selection¶

Leakage prevention (critical)¶

Interpretation of Validation Comparison¶

7 Final Model Retraining and Test Evaluation¶

8 Conclusions and Discussion¶

Model Comparison Summary¶

Reasons for Selecting OLS Regression¶

Interpretation and Insights¶

Limitations and Future Work¶

Educational Reflection¶

9 References and Acknowledgments¶