Housing value estimation model training
Let's train a simple regressor using Scikit-Learn, then convert the pipeline to the ONNX format.
In [1]:
from pathlib import Path
import numpy as np
import onnxruntime as ort
import pandas as pd
import skl2onnx
from sklearn.linear_model import LinearRegression
from sklearn.metrics import root_mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
Load the French housing transactions dataset (DVF) for the Isère department in 2022:
In [2]:
dvf_38 = pd.read_csv(
"https://files.data.gouv.fr/geo-dvf/latest/csv/2022/departements/38.csv.gz"
)
dvf_38.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 75871 entries, 0 to 75870
Data columns (total 40 columns):
 #   Column                        Non-Null Count  Dtype
---  ------                        --------------  -----
 0   id_mutation                   75871 non-null  object
 1   date_mutation                 75871 non-null  object
 2   numero_disposition            75871 non-null  int64
 3   nature_mutation               75871 non-null  object
 4   valeur_fonciere               75504 non-null  float64
 5   adresse_numero                48243 non-null  float64
 6   adresse_suffixe               2482 non-null   object
 7   adresse_nom_voie              74526 non-null  object
 8   adresse_code_voie             74529 non-null  object
 9   code_postal                   74528 non-null  float64
 10  code_commune                  75871 non-null  int64
 11  nom_commune                   75871 non-null  object
 12  code_departement              75871 non-null  int64
 13  ancien_code_commune           0 non-null      float64
 14  ancien_nom_commune            0 non-null      float64
 15  id_parcelle                   75871 non-null  object
 16  ancien_id_parcelle            0 non-null      float64
 17  numero_volume                 150 non-null    float64
 18  lot1_numero                   33401 non-null  object
 19  lot1_surface_carrez           8391 non-null   float64
 20  lot2_numero                   9313 non-null   float64
 21  lot2_surface_carrez           3167 non-null   float64
 22  lot3_numero                   1215 non-null   float64
 23  lot3_surface_carrez           208 non-null    float64
 24  lot4_numero                   353 non-null    float64
 25  lot4_surface_carrez           48 non-null     float64
 26  lot5_numero                   180 non-null    float64
 27  lot5_surface_carrez           29 non-null     float64
 28  nombre_lots                   75871 non-null  int64
 29  code_type_local               46068 non-null  float64
 30  type_local                    46068 non-null  object
 31  surface_reelle_bati           25589 non-null  float64
 32  nombre_pieces_principales     46034 non-null  float64
 33  code_nature_culture           42130 non-null  object
 34  nature_culture                42130 non-null  object
 35  code_nature_culture_speciale  2945 non-null   object
 36  nature_culture_speciale       2945 non-null   object
 37  surface_terrain               42130 non-null  float64
 38  longitude                     73674 non-null  float64
 39  latitude                      73674 non-null  float64
dtypes: float64(22), int64(4), object(14)
memory usage: 23.2+ MB
/tmp/ipykernel_1984/1738013210.py:1: DtypeWarning: Columns (18) have mixed types. Specify dtype option on import or set low_memory=False.
  dvf_38 = pd.read_csv(
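The DtypeWarning comes from column 18 (lot1_numero), which mixes numeric and string values. A minimal way to avoid it, assuming the extra memory use is acceptable, is to let pandas read the whole file in one pass so that one consistent dtype is inferred per column:

# Sketch: read the CSV in a single pass to get consistent dtype inference
dvf_38 = pd.read_csv(
    "https://files.data.gouv.fr/geo-dvf/latest/csv/2022/departements/38.csv.gz",
    low_memory=False,
)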
Prepare the dataset to keep only sales of apartments in Grenoble:
In [3]:
dataset = dvf_38.copy()
dataset = dataset[
(dataset.nature_mutation == "Vente")
& (dataset.type_local == "Appartement")
& (dataset.nom_commune == "Grenoble")
]
dataset = dataset[
[
"surface_reelle_bati",
"nombre_pieces_principales",
"latitude",
"longitude",
"valeur_fonciere",
]
]
dataset = dataset.rename(
columns={
"surface_reelle_bati": "area",
"nombre_pieces_principales": "rooms",
"valeur_fonciere": "value",
}
)
dataset = dataset.dropna()
dataset = dataset.reset_index()
dataset
Out[3]:
|  | index | area | rooms | latitude | longitude | value |
|---|---|---|---|---|---|---|
| 0 | 1 | 70.0 | 3.0 | 45.176163 | 5.719166 | 225000.0 |
| 1 | 6 | 109.0 | 4.0 | 45.187065 | 5.718309 | 257900.0 |
| 2 | 15 | 54.0 | 2.0 | 45.181912 | 5.711105 | 151500.0 |
| 3 | 26 | 97.0 | 5.0 | 45.173124 | 5.708733 | 160000.0 |
| 4 | 31 | 31.0 | 1.0 | 45.182767 | 5.743471 | 87000.0 |
| ... | ... | ... | ... | ... | ... | ... |
| 3523 | 44672 | 54.0 | 3.0 | 45.179669 | 5.717220 | 165500.0 |
| 3524 | 44679 | 74.0 | 5.0 | 45.180877 | 5.711429 | 127000.0 |
| 3525 | 44688 | 61.0 | 3.0 | 45.166853 | 5.726352 | 110000.0 |
| 3526 | 44691 | 73.0 | 3.0 | 45.181464 | 5.720759 | 192000.0 |
| 3527 | 44692 | 57.0 | 4.0 | 45.169246 | 5.723737 | 112420.0 |
3528 rows × 6 columns
Split the dataset into train and test sets:
In [4]:
X = dataset[["area", "rooms", "latitude", "longitude"]]
y = dataset["value"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)
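Note that the split is random, so the scores below change from run to run. If reproducibility matters, a variant with a pinned seed (not what was used for the outputs shown here) would be:

# Sketch: fix the split with an arbitrary seed so results are reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)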
Train a Scikit-Learn pipeline, including a normalization step and a regression model:
In [5]:
pipeline = Pipeline(
[
("scaler", StandardScaler()),
("regressor", LinearRegression()),
]
)
pipeline.fit(X_train, y_train)
Out[5]:
Pipeline(steps=[('scaler', StandardScaler()), ('regressor', LinearRegression())])
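Since the final step is a plain linear regression, it can be instructive (though not needed for the rest of the notebook) to look at the learned coefficients; after standardization they are expressed in euros per standard deviation of each feature:

# Sketch: map each input feature to its learned coefficient
dict(zip(X.columns, pipeline.named_steps["regressor"].coef_))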
Score the model (RMSE) on the test set:
In [6]:
root_mean_squared_error(y_test, pipeline.predict(X_test))
Out[6]:
290714.8123380589
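That RMSE is large compared with the sale values shown earlier, which suggests a few very large transactions likely dominate the error. One quick check (not in the original notebook) is to compare against a baseline that always predicts the mean training value; the linear pipeline should at least beat it:

from sklearn.dummy import DummyRegressor

# Sketch: baseline regressor that always predicts the mean of y_train
baseline = DummyRegressor(strategy="mean").fit(X_train, y_train)
root_mean_squared_error(y_test, baseline.predict(X_test))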
Try to predict the value of an apartment (50 m², 3 rooms, near Place Victor Hugo in Grenoble):
In [7]:
pipeline.predict([[50, 3, 45.1893525, 5.7216074]])
/home/runner/.local/share/virtualenvs/2024-02-23-ml-models-web-GaDE5OIw/lib/python3.11/site-packages/sklearn/base.py:493: UserWarning: X does not have valid feature names, but StandardScaler was fitted with feature names
  warnings.warn(
Out[7]:
array([264748.35412483])
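The UserWarning appears because the pipeline was fitted on a DataFrame with named columns but receives a bare list here. Passing a one-row DataFrame with the same column names gives the same prediction without the warning:

# Sketch: predict from a DataFrame so feature names match the fitted pipeline
sample = pd.DataFrame(
    [[50.0, 3.0, 45.1893525, 5.7216074]],
    columns=["area", "rooms", "latitude", "longitude"],
)
pipeline.predict(sample)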
Export the model to ONNX format using skl2onnx:
In [8]:
onnx_model = skl2onnx.to_onnx(pipeline, X_train[:1].astype(np.float32))
onnx_model_path = Path() / "model.onnx"
onnx_model_path.write_bytes(onnx_model.SerializeToString())
Out[8]:
580
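Because to_onnx received a DataFrame, skl2onnx creates one named graph input per column, which is why the inference call below feeds area, rooms, latitude, and longitude separately. A quick way to confirm the expected inputs, assuming the onnx package is installed:

import onnx

# Sketch: list the graph's input names (expected: area, rooms, latitude, longitude)
onnx_graph = onnx.load(onnx_model_path)
[i.name for i in onnx_graph.graph.input]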
Load the ONNX model and run an inference on the sample data:
In [9]:
session = ort.InferenceSession(onnx_model_path, providers=ort.get_available_providers())
session.run(
None,
{
"area": [[50.0]],
"rooms": [[3.0]],
"latitude": [[45.1893525]],
"longitude": [[5.7216074]],
},
)
Out[9]:
[array([[264750.3]], dtype=float32)]
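The ONNX result (≈264750.3) differs slightly from the scikit-learn one (≈264748.35) because the exported graph computes in float32. A minimal sanity check, comparing the two predictions with a loose tolerance:

# Sketch: check that the ONNX runtime and scikit-learn agree within float32 precision
sklearn_pred = pipeline.predict(
    pd.DataFrame(
        [[50.0, 3.0, 45.1893525, 5.7216074]],
        columns=["area", "rooms", "latitude", "longitude"],
    )
)
onnx_pred = session.run(
    None,
    {
        "area": [[50.0]],
        "rooms": [[3.0]],
        "latitude": [[45.1893525]],
        "longitude": [[5.7216074]],
    },
)[0]
np.testing.assert_allclose(onnx_pred.ravel(), sklearn_pred, rtol=1e-4)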