In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

import numpy as np

import pandas as pd

%matplotlib inline

# Make NumPy printouts easier to read.
np.set_printoptions(precision=3, suppress=True)
In [2]:
mpg_data = pd.read_csv('auto-mpg.csv.gz', sep='|')
In [3]:
mpg_data
Out[3]:
MPG Cylinders Displacement Horsepower Weight Acceleration Model Year Origin Car Name
0 18.0 8 307.0 130.0 3504.0 12.0 70 USA chevrolet chevelle malibu
1 15.0 8 350.0 165.0 3693.0 11.5 70 USA buick skylark 320
2 18.0 8 318.0 150.0 3436.0 11.0 70 USA plymouth satellite
3 16.0 8 304.0 150.0 3433.0 12.0 70 USA amc rebel sst
4 17.0 8 302.0 140.0 3449.0 10.5 70 USA ford torino
... ... ... ... ... ... ... ... ... ...
393 27.0 4 140.0 86.0 2790.0 15.6 82 USA ford mustang gl
394 44.0 4 97.0 52.0 2130.0 24.6 82 Europe vw pickup
395 32.0 4 135.0 84.0 2295.0 11.6 82 USA dodge rampage
396 28.0 4 120.0 79.0 2625.0 18.6 82 USA ford ranger
397 31.0 4 119.0 82.0 2720.0 19.4 82 USA chevy s-10

398 rows × 9 columns

In [4]:
mpg_data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   MPG           398 non-null    float64
 1   Cylinders     398 non-null    int64  
 2   Displacement  398 non-null    float64
 3   Horsepower    392 non-null    float64
 4   Weight        398 non-null    float64
 5   Acceleration  398 non-null    float64
 6   Model Year    398 non-null    int64  
 7   Origin        398 non-null    object 
 8   Car Name      398 non-null    object 
dtypes: float64(5), int64(2), object(2)
memory usage: 28.1+ KB
In [5]:
mpg_data['Origin'].unique()
Out[5]:
array(['USA', 'Japan', 'Europe'], dtype=object)
In [6]:
mpg_data = mpg_data.drop('Car Name', axis=1)
In [7]:
sns.pairplot(mpg_data[['MPG', 'Cylinders', 'Displacement', 'Weight', 'Horsepower']], diag_kind='kde')
Out[7]:
<seaborn.axisgrid.PairGrid at 0x7fd8ede445e0>
In [8]:
corr = mpg_data.corr()
sns.heatmap(corr, annot=True)
Out[8]:
<AxesSubplot:>
In [9]:
mpg_data_labels = mpg_data['MPG'].copy().to_numpy()
mpg_data_features = mpg_data.drop('MPG', axis=1)
In [10]:
num_tr_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy="median")),
        ('std_scaler', StandardScaler()),
    ])

cat_tr_pipeline = Pipeline([
        ('ordinal_encoder', OrdinalEncoder()),
        ('one_hot_encoder', OneHotEncoder()),
    ])
In [11]:
num_attribs = [col for col in mpg_data_features.columns if col != 'Origin']
cat_attribs = ['Origin']

full_pipeline = ColumnTransformer([
        ("num_tr_pipeline", num_tr_pipeline, num_attribs),
        ("cat_tr_pipeline", cat_tr_pipeline, cat_attribs),
    ])

mpg_transformed = full_pipeline.fit_transform(mpg_data_features)
In [12]:
mpg_train_data, mpg_test_data, mpg_train_labels, mpg_test_labels = train_test_split(mpg_transformed, mpg_data_labels, test_size=0.3, random_state=0)
In [13]:
lin_reg = LinearRegression()
lin_reg.fit(mpg_train_data, mpg_train_labels)
mpg_test_predicted = lin_reg.predict(mpg_test_data)
np.sqrt(mean_squared_error(mpg_test_labels, mpg_test_predicted, squared=True))
Out[13]:
3.366034587971452
In [14]:
from sklearn.ensemble import RandomForestRegressor

forest_reg = RandomForestRegressor(n_estimators=50, random_state=0)
forest_reg.fit(mpg_train_data, mpg_train_labels)
mpg_test_predicted = forest_reg.predict(mpg_test_data)
np.sqrt(mean_squared_error(mpg_test_predicted, mpg_test_labels, squared=True))
Out[14]:
2.7287667360915995