import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import numpy as np
import pandas as pd
%matplotlib inline
# Print NumPy floats with three decimals in fixed-point notation.
np.set_printoptions(suppress=True, precision=3)
# Load the pipe-delimited file (pandas infers the compression from the
# '.gz' extension) and display the resulting frame.
mpg_data = pd.read_csv('auto-mpg.csv.gz', delimiter='|')
mpg_data
| | MPG | Cylinders | Displacement | Horsepower | Weight | Acceleration | Model Year | Origin | Car Name |
---|---|---|---|---|---|---|---|---|---|
0 | 18.0 | 8 | 307.0 | 130.0 | 3504.0 | 12.0 | 70 | USA | chevrolet chevelle malibu |
1 | 15.0 | 8 | 350.0 | 165.0 | 3693.0 | 11.5 | 70 | USA | buick skylark 320 |
2 | 18.0 | 8 | 318.0 | 150.0 | 3436.0 | 11.0 | 70 | USA | plymouth satellite |
3 | 16.0 | 8 | 304.0 | 150.0 | 3433.0 | 12.0 | 70 | USA | amc rebel sst |
4 | 17.0 | 8 | 302.0 | 140.0 | 3449.0 | 10.5 | 70 | USA | ford torino |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
393 | 27.0 | 4 | 140.0 | 86.0 | 2790.0 | 15.6 | 82 | USA | ford mustang gl |
394 | 44.0 | 4 | 97.0 | 52.0 | 2130.0 | 24.6 | 82 | Europe | vw pickup |
395 | 32.0 | 4 | 135.0 | 84.0 | 2295.0 | 11.6 | 82 | USA | dodge rampage |
396 | 28.0 | 4 | 120.0 | 79.0 | 2625.0 | 18.6 | 82 | USA | ford ranger |
397 | 31.0 | 4 | 119.0 | 82.0 | 2720.0 | 19.4 | 82 | USA | chevy s-10 |
398 rows × 9 columns
# Summarize column dtypes and non-null counts to spot missing values.
mpg_data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype
---  ------        --------------  -----
 0   MPG           398 non-null    float64
 1   Cylinders     398 non-null    int64
 2   Displacement  398 non-null    float64
 3   Horsepower    392 non-null    float64
 4   Weight        398 non-null    float64
 5   Acceleration  398 non-null    float64
 6   Model Year    398 non-null    int64
 7   Origin        398 non-null    object
 8   Car Name      398 non-null    object
dtypes: float64(5), int64(2), object(2)
memory usage: 28.1+ KB
# List the distinct values of the string-valued 'Origin' column.
mpg_data['Origin'].unique()
array(['USA', 'Japan', 'Europe'], dtype=object)
# Drop the 'Car Name' column; it is not used in the rest of the analysis.
mpg_data = mpg_data.drop(columns='Car Name')
# Pairwise scatter plots for MPG and a subset of the numeric columns, with
# kernel-density estimates on the diagonal.
pairplot_columns = ['MPG', 'Cylinders', 'Displacement', 'Weight', 'Horsepower']
sns.pairplot(mpg_data[pairplot_columns], diag_kind='kde')
<seaborn.axisgrid.PairGrid at 0x7fd8ede445e0>
# Correlation matrix over the numeric columns only. 'Origin' holds strings,
# and DataFrame.corr() raises on non-numeric columns in pandas >= 2.0 (older
# versions silently skipped them), so select the numeric dtypes explicitly;
# this works on every pandas version.
corr = mpg_data.select_dtypes(include='number').corr()
# Annotated heatmap of the pairwise correlations.
sns.heatmap(corr, annot=True)
<AxesSubplot:>
# Separate the regression target (MPG) from the predictor columns.
mpg_data_features = mpg_data.drop(columns='MPG')
target_column = mpg_data['MPG'].copy()
mpg_data_labels = target_column.to_numpy()
# Numeric columns: fill missing values with the column median, then
# standardize each column.
num_tr_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy="median")),
    ('std_scaler', StandardScaler()),
])

# Categorical columns: OneHotEncoder encodes string categories directly, so
# the former OrdinalEncoder pre-step was redundant. Both encoders sort the
# categories, so the resulting one-hot columns and their order are unchanged.
cat_tr_pipeline = Pipeline([
    ('one_hot_encoder', OneHotEncoder()),
])

# 'Origin' is the only categorical feature; every other feature column is
# treated as numeric.
cat_attribs = ['Origin']
num_attribs = [col for col in mpg_data_features.columns if col not in cat_attribs]

# Route each group of columns through its own preprocessing pipeline.
full_pipeline = ColumnTransformer([
    ("num_tr_pipeline", num_tr_pipeline, num_attribs),
    ("cat_tr_pipeline", cat_tr_pipeline, cat_attribs),
])
# Split the raw rows first so that the imputer median, scaler statistics and
# encoder categories are learned from the training rows only. Fitting the
# preprocessing on the full dataset before splitting leaks test-set
# information into training.
# NOTE: scores recorded further down were produced with the earlier
# fit-on-everything preprocessing and may shift slightly when re-run.
mpg_train_features, mpg_test_features, mpg_train_labels, mpg_test_labels = train_test_split(
    mpg_data_features, mpg_data_labels, test_size=0.3, random_state=0)
mpg_train_data = full_pipeline.fit_transform(mpg_train_features)
mpg_test_data = full_pipeline.transform(mpg_test_features)
# Whole dataset passed through the train-fitted pipeline; kept so the
# `mpg_transformed` name remains available.
mpg_transformed = full_pipeline.transform(mpg_data_features)
# Baseline model: ordinary least-squares regression on the preprocessed
# training features.
lin_reg = LinearRegression()
lin_reg.fit(mpg_train_data, mpg_train_labels)
mpg_test_predicted = lin_reg.predict(mpg_test_data)
# Test-set RMSE. mean_squared_error returns the MSE by default; the `squared`
# keyword is deprecated since scikit-learn 1.4 and removed in 1.6, so it is
# not passed and the square root is taken explicitly.
np.sqrt(mean_squared_error(mpg_test_labels, mpg_test_predicted))
3.366034587971452
from sklearn.ensemble import RandomForestRegressor
# Second model: a random forest of 50 trees with a fixed seed for
# reproducible results.
forest_reg = RandomForestRegressor(n_estimators=50, random_state=0)
forest_reg.fit(mpg_train_data, mpg_train_labels)
mpg_test_predicted = forest_reg.predict(mpg_test_data)
# Test-set RMSE, with arguments in the documented (y_true, y_pred) order.
# The `squared` keyword (deprecated in scikit-learn 1.4, removed in 1.6) is
# not passed; the default already returns the MSE.
np.sqrt(mean_squared_error(mpg_test_labels, mpg_test_predicted))
2.7287667360915995