In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
In [2]:
train_df = pd.read_excel('Data_Train.xlsx')
In [3]:
pd.set_option('display.max_columns', None)
In [4]:
train_df.head()
Out[4]:
Airline Date_of_Journey Source Destination Route Dep_Time Arrival_Time Duration Total_Stops Additional_Info Price
0 IndiGo 24/03/2019 Banglore New Delhi BLR → DEL 22:20 01:10 22 Mar 2h 50m non-stop No info 3897
1 Air India 1/05/2019 Kolkata Banglore CCU → IXR → BBI → BLR 05:50 13:15 7h 25m 2 stops No info 7662
2 Jet Airways 9/06/2019 Delhi Cochin DEL → LKO → BOM → COK 09:25 04:25 10 Jun 19h 2 stops No info 13882
3 IndiGo 12/05/2019 Kolkata Banglore CCU → NAG → BLR 18:05 23:30 5h 25m 1 stop No info 6218
4 IndiGo 01/03/2019 Banglore New Delhi BLR → NAG → DEL 16:50 21:35 4h 45m 1 stop No info 13302
In [5]:
train_df.shape
Out[5]:
(10683, 11)
In [6]:
type(train_df)
Out[6]:
pandas.core.frame.DataFrame
In [7]:
train_df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10683 entries, 0 to 10682
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Airline          10683 non-null  object
 1   Date_of_Journey  10683 non-null  object
 2   Source           10683 non-null  object
 3   Destination      10683 non-null  object
 4   Route            10682 non-null  object
 5   Dep_Time         10683 non-null  object
 6   Arrival_Time     10683 non-null  object
 7   Duration         10683 non-null  object
 8   Total_Stops      10682 non-null  object
 9   Additional_Info  10683 non-null  object
 10  Price            10683 non-null  int64 
dtypes: int64(1), object(10)
memory usage: 918.2+ KB
In [8]:
train_df.Duration.value_counts()
Out[8]:
2h 50m     550
1h 30m     386
2h 45m     337
2h 55m     337
2h 35m     329
          ... 
31h 30m      1
30h 25m      1
42h 5m       1
4h 10m       1
47h 40m      1
Name: Duration, Length: 368, dtype: int64
In [9]:
train_df.isna().sum()
Out[9]:
Airline            0
Date_of_Journey    0
Source             0
Destination        0
Route              1
Dep_Time           0
Arrival_Time       0
Duration           0
Total_Stops        1
Additional_Info    0
Price              0
dtype: int64
In [10]:
train_df.dropna(inplace=True)
In [11]:
train_df.isna().sum()
Out[11]:
Airline            0
Date_of_Journey    0
Source             0
Destination        0
Route              0
Dep_Time           0
Arrival_Time       0
Duration           0
Total_Stops        0
Additional_Info    0
Price              0
dtype: int64

EDA¶

In [12]:
train_df['Journy_Day'] = pd.to_datetime(train_df.Date_of_Journey, format='%d/%m/%Y').dt.day
In [13]:
train_df['Journy_Month'] = pd.to_datetime(train_df.Date_of_Journey, format='%d/%m/%Y').dt.month
In [14]:
train_df.head()
Out[14]:
Airline Date_of_Journey Source Destination Route Dep_Time Arrival_Time Duration Total_Stops Additional_Info Price Journy_Day Journy_Month
0 IndiGo 24/03/2019 Banglore New Delhi BLR → DEL 22:20 01:10 22 Mar 2h 50m non-stop No info 3897 24 3
1 Air India 1/05/2019 Kolkata Banglore CCU → IXR → BBI → BLR 05:50 13:15 7h 25m 2 stops No info 7662 1 5
2 Jet Airways 9/06/2019 Delhi Cochin DEL → LKO → BOM → COK 09:25 04:25 10 Jun 19h 2 stops No info 13882 9 6
3 IndiGo 12/05/2019 Kolkata Banglore CCU → NAG → BLR 18:05 23:30 5h 25m 1 stop No info 6218 12 5
4 IndiGo 01/03/2019 Banglore New Delhi BLR → NAG → DEL 16:50 21:35 4h 45m 1 stop No info 13302 1 3
In [15]:
train_df.drop(['Date_of_Journey'], axis=1, inplace=True)
In [16]:
train_df.head()
Out[16]:
Airline Source Destination Route Dep_Time Arrival_Time Duration Total_Stops Additional_Info Price Journy_Day Journy_Month
0 IndiGo Banglore New Delhi BLR → DEL 22:20 01:10 22 Mar 2h 50m non-stop No info 3897 24 3
1 Air India Kolkata Banglore CCU → IXR → BBI → BLR 05:50 13:15 7h 25m 2 stops No info 7662 1 5
2 Jet Airways Delhi Cochin DEL → LKO → BOM → COK 09:25 04:25 10 Jun 19h 2 stops No info 13882 9 6
3 IndiGo Kolkata Banglore CCU → NAG → BLR 18:05 23:30 5h 25m 1 stop No info 6218 12 5
4 IndiGo Banglore New Delhi BLR → NAG → DEL 16:50 21:35 4h 45m 1 stop No info 13302 1 3
In [17]:
## Departure time is when the plane leaves the gate

train_df['Dep_hour'] = pd.to_datetime(train_df.Dep_Time).dt.hour
train_df['Dep_min'] = pd.to_datetime(train_df.Dep_Time).dt.minute

train_df.drop(['Dep_Time'], axis=1, inplace=True)
In [18]:
train_df.head()
Out[18]:
Airline Source Destination Route Arrival_Time Duration Total_Stops Additional_Info Price Journy_Day Journy_Month Dep_hour Dep_min
0 IndiGo Banglore New Delhi BLR → DEL 01:10 22 Mar 2h 50m non-stop No info 3897 24 3 22 20
1 Air India Kolkata Banglore CCU → IXR → BBI → BLR 13:15 7h 25m 2 stops No info 7662 1 5 5 50
2 Jet Airways Delhi Cochin DEL → LKO → BOM → COK 04:25 10 Jun 19h 2 stops No info 13882 9 6 9 25
3 IndiGo Kolkata Banglore CCU → NAG → BLR 23:30 5h 25m 1 stop No info 6218 12 5 18 5
4 IndiGo Banglore New Delhi BLR → NAG → DEL 21:35 4h 45m 1 stop No info 13302 1 3 16 50
In [19]:
## Arrival time is when the plane pulls up to the gate

train_df['Arrival_hour'] = pd.to_datetime(train_df.Arrival_Time).dt.hour
train_df['Arrival_min'] = pd.to_datetime(train_df.Arrival_Time).dt.minute

train_df.drop(['Arrival_Time'], axis=1, inplace=True)
In [20]:
train_df.head()
Out[20]:
Airline Source Destination Route Duration Total_Stops Additional_Info Price Journy_Day Journy_Month Dep_hour Dep_min Arrival_hour Arrival_min
0 IndiGo Banglore New Delhi BLR → DEL 2h 50m non-stop No info 3897 24 3 22 20 1 10
1 Air India Kolkata Banglore CCU → IXR → BBI → BLR 7h 25m 2 stops No info 7662 1 5 5 50 13 15
2 Jet Airways Delhi Cochin DEL → LKO → BOM → COK 19h 2 stops No info 13882 9 6 9 25 4 25
3 IndiGo Kolkata Banglore CCU → NAG → BLR 5h 25m 1 stop No info 6218 12 5 18 5 23 30
4 IndiGo Banglore New Delhi BLR → NAG → DEL 4h 45m 1 stop No info 13302 1 3 16 50 21 35
In [21]:
## Time taken to reach destination is called Duration
## It is difference between Departure Time and Arrival Time

## Assigning and converting Duration column into list
duration = list(train_df.Duration)

for i in range(len(duration)):
    if len(duration[i].split()) != 2:                       # Check if duration contains only hour or mins
        if 'h' in duration[i]:
            duration[i] = duration[i].strip() + " 0m"  # Adding 0 mins
        else:
            duration[i] = "0h " + duration[i]


duration_hours = []
duration_mins = []
for i in range(len(duration)):
    duration_hours.append(int(duration[i].split("h")[0]))    # Extract hours from Duration
    duration_mins.append(int(duration[i].split('m')[0].split()[-1]))  # Extract onlye minutes from Duration
    
In [22]:
train_df["Duration_hours"] = duration_hours
train_df["Duration_mins"] = duration_mins
In [23]:
train_df.tail()
Out[23]:
Airline Source Destination Route Duration Total_Stops Additional_Info Price Journy_Day Journy_Month Dep_hour Dep_min Arrival_hour Arrival_min Duration_hours Duration_mins
10678 Air Asia Kolkata Banglore CCU → BLR 2h 30m non-stop No info 4107 9 4 19 55 22 25 2 30
10679 Air India Kolkata Banglore CCU → BLR 2h 35m non-stop No info 4145 27 4 20 45 23 20 2 35
10680 Jet Airways Banglore Delhi BLR → DEL 3h non-stop No info 7229 27 4 8 20 11 20 3 0
10681 Vistara Banglore New Delhi BLR → DEL 2h 40m non-stop No info 12648 1 3 11 30 14 10 2 40
10682 Air India Delhi Cochin DEL → GOI → BOM → COK 8h 20m 2 stops No info 11753 9 5 10 55 19 15 8 20
In [24]:
train_df.drop(['Duration'], axis=1, inplace=True)
In [25]:
train_df.head()
Out[25]:
Airline Source Destination Route Total_Stops Additional_Info Price Journy_Day Journy_Month Dep_hour Dep_min Arrival_hour Arrival_min Duration_hours Duration_mins
0 IndiGo Banglore New Delhi BLR → DEL non-stop No info 3897 24 3 22 20 1 10 2 50
1 Air India Kolkata Banglore CCU → IXR → BBI → BLR 2 stops No info 7662 1 5 5 50 13 15 7 25
2 Jet Airways Delhi Cochin DEL → LKO → BOM → COK 2 stops No info 13882 9 6 9 25 4 25 19 0
3 IndiGo Kolkata Banglore CCU → NAG → BLR 1 stop No info 6218 12 5 18 5 23 30 5 25
4 IndiGo Banglore New Delhi BLR → NAG → DEL 1 stop No info 13302 1 3 16 50 21 35 4 45

Handling Categorical Data¶

In [26]:
train_df.Airline.value_counts()
Out[26]:
Jet Airways                          3849
IndiGo                               2053
Air India                            1751
Multiple carriers                    1196
SpiceJet                              818
Vistara                               479
Air Asia                              319
GoAir                                 194
Multiple carriers Premium economy      13
Jet Airways Business                    6
Vistara Premium economy                 3
Trujet                                  1
Name: Airline, dtype: int64
In [27]:
sns.catplot(y="Price", x="Airline", data=train_df.sort_values("Price", ascending=False), kind='boxen', height=6, aspect=3)
Out[27]:
<seaborn.axisgrid.FacetGrid at 0x137168d30>
In [28]:
len(train_df['Airline'].unique())
Out[28]:
12
In [29]:
Airline = train_df[['Airline']]
Airline = pd.get_dummies(Airline, drop_first=True)
Airline.head()
Out[29]:
Airline_Air India Airline_GoAir Airline_IndiGo Airline_Jet Airways Airline_Jet Airways Business Airline_Multiple carriers Airline_Multiple carriers Premium economy Airline_SpiceJet Airline_Trujet Airline_Vistara Airline_Vistara Premium economy
0 0 0 1 0 0 0 0 0 0 0 0
1 1 0 0 0 0 0 0 0 0 0 0
2 0 0 0 1 0 0 0 0 0 0 0
3 0 0 1 0 0 0 0 0 0 0 0
4 0 0 1 0 0 0 0 0 0 0 0
In [30]:
train_df.Source.value_counts()
Out[30]:
Delhi       4536
Kolkata     2871
Banglore    2197
Mumbai       697
Chennai      381
Name: Source, dtype: int64
In [31]:
sns.catplot(y='Price', x='Source', data=train_df.sort_values('Price', ascending=False), kind='boxen', height=6, aspect=3)
Out[31]:
<seaborn.axisgrid.FacetGrid at 0x1370b8be0>
In [32]:
Source = train_df[['Source']]
Source = pd.get_dummies(Source, drop_first=True)
Source.head()
Out[32]:
Source_Chennai Source_Delhi Source_Kolkata Source_Mumbai
0 0 0 0 0
1 0 0 1 0
2 0 1 0 0
3 0 0 1 0
4 0 0 0 0
In [33]:
Destination = train_df[['Destination']]
Destination = pd.get_dummies(Destination, drop_first=True)
Destination.head()
Out[33]:
Destination_Cochin Destination_Delhi Destination_Hyderabad Destination_Kolkata Destination_New Delhi
0 0 0 0 0 1
1 0 0 0 0 0
2 1 0 0 0 0
3 0 0 0 0 0
4 0 0 0 0 1
In [34]:
train_df.Route
Out[34]:
0                    BLR → DEL
1        CCU → IXR → BBI → BLR
2        DEL → LKO → BOM → COK
3              CCU → NAG → BLR
4              BLR → NAG → DEL
                 ...          
10678                CCU → BLR
10679                CCU → BLR
10680                BLR → DEL
10681                BLR → DEL
10682    DEL → GOI → BOM → COK
Name: Route, Length: 10682, dtype: object
In [35]:
a = train_df.Additional_Info=='No info'
In [36]:
a.mean()
Out[36]:
0.781127129750983
In [37]:
## Additional_Info conatins almost 80% no_info
# Route and Total_Stops are related to each other

train_df.drop(['Route', 'Additional_Info'], axis=1, inplace=True)
In [38]:
train_df.head()
Out[38]:
Airline Source Destination Total_Stops Price Journy_Day Journy_Month Dep_hour Dep_min Arrival_hour Arrival_min Duration_hours Duration_mins
0 IndiGo Banglore New Delhi non-stop 3897 24 3 22 20 1 10 2 50
1 Air India Kolkata Banglore 2 stops 7662 1 5 5 50 13 15 7 25
2 Jet Airways Delhi Cochin 2 stops 13882 9 6 9 25 4 25 19 0
3 IndiGo Kolkata Banglore 1 stop 6218 12 5 18 5 23 30 5 25
4 IndiGo Banglore New Delhi 1 stop 13302 1 3 16 50 21 35 4 45
In [39]:
train_df.Total_Stops.value_counts()
Out[39]:
1 stop      5625
non-stop    3491
2 stops     1520
3 stops       45
4 stops        1
Name: Total_Stops, dtype: int64
In [40]:
## As this is case of Ordinal Categorical type we perform LabelEncoder
## Here vlaues are assigned with corresponding keys
train_df.replace({"non-stop": 0, "1 stop": 1, "2 stops": 2, "3 stops": 3, "4 stops": 4}, inplace=True)
In [41]:
# train_df.head()
In [42]:
data_train = pd.concat([train_df, Airline, Source, Destination], axis=1)
In [43]:
data_train.head()
Out[43]:
Airline Source Destination Total_Stops Price Journy_Day Journy_Month Dep_hour Dep_min Arrival_hour Arrival_min Duration_hours Duration_mins Airline_Air India Airline_GoAir Airline_IndiGo Airline_Jet Airways Airline_Jet Airways Business Airline_Multiple carriers Airline_Multiple carriers Premium economy Airline_SpiceJet Airline_Trujet Airline_Vistara Airline_Vistara Premium economy Source_Chennai Source_Delhi Source_Kolkata Source_Mumbai Destination_Cochin Destination_Delhi Destination_Hyderabad Destination_Kolkata Destination_New Delhi
0 IndiGo Banglore New Delhi 0 3897 24 3 22 20 1 10 2 50 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1
1 Air India Kolkata Banglore 2 7662 1 5 5 50 13 15 7 25 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0
2 Jet Airways Delhi Cochin 2 13882 9 6 9 25 4 25 19 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0
3 IndiGo Kolkata Banglore 1 6218 12 5 18 5 23 30 5 25 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0
4 IndiGo Banglore New Delhi 1 13302 1 3 16 50 21 35 4 45 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1
In [44]:
data_train.drop(['Destination', 'Source', 'Airline'], axis=1, inplace=True)
In [45]:
data_train.head()
Out[45]:
Total_Stops Price Journy_Day Journy_Month Dep_hour Dep_min Arrival_hour Arrival_min Duration_hours Duration_mins Airline_Air India Airline_GoAir Airline_IndiGo Airline_Jet Airways Airline_Jet Airways Business Airline_Multiple carriers Airline_Multiple carriers Premium economy Airline_SpiceJet Airline_Trujet Airline_Vistara Airline_Vistara Premium economy Source_Chennai Source_Delhi Source_Kolkata Source_Mumbai Destination_Cochin Destination_Delhi Destination_Hyderabad Destination_Kolkata Destination_New Delhi
0 0 3897 24 3 22 20 1 10 2 50 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1
1 2 7662 1 5 5 50 13 15 7 25 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0
2 2 13882 9 6 9 25 4 25 19 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0
3 1 6218 12 5 18 5 23 30 5 25 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0
4 1 13302 1 3 16 50 21 35 4 45 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1
In [46]:
data_train.shape
Out[46]:
(10682, 30)

Test Set¶

In [47]:
test_data = pd.read_excel('Test_set.xlsx')
In [48]:
test_data.head()
Out[48]:
Airline Date_of_Journey Source Destination Route Dep_Time Arrival_Time Duration Total_Stops Additional_Info
0 Jet Airways 6/06/2019 Delhi Cochin DEL → BOM → COK 17:30 04:25 07 Jun 10h 55m 1 stop No info
1 IndiGo 12/05/2019 Kolkata Banglore CCU → MAA → BLR 06:20 10:20 4h 1 stop No info
2 Jet Airways 21/05/2019 Delhi Cochin DEL → BOM → COK 19:15 19:00 22 May 23h 45m 1 stop In-flight meal not included
3 Multiple carriers 21/05/2019 Delhi Cochin DEL → BOM → COK 08:00 21:00 13h 1 stop No info
4 Air Asia 24/06/2019 Banglore Delhi BLR → DEL 23:55 02:45 25 Jun 2h 50m non-stop No info
In [49]:
test_data.shape
Out[49]:
(2671, 10)
In [50]:
print("Test data Info")
print("-"*75)
print(test_data.info())

print()
print()

print("Null values :")
print("-"*75)
test_data.dropna(inplace = True)
print(test_data.isnull().sum())

# EDA

# Date_of_Journey
test_data["Journey_day"] = pd.to_datetime(test_data.Date_of_Journey, format="%d/%m/%Y").dt.day
test_data["Journey_month"] = pd.to_datetime(test_data["Date_of_Journey"], format = "%d/%m/%Y").dt.month
test_data.drop(["Date_of_Journey"], axis = 1, inplace = True)

# Dep_Time
test_data["Dep_hour"] = pd.to_datetime(test_data["Dep_Time"]).dt.hour
test_data["Dep_min"] = pd.to_datetime(test_data["Dep_Time"]).dt.minute
test_data.drop(["Dep_Time"], axis = 1, inplace = True)

# Arrival_Time
test_data["Arrival_hour"] = pd.to_datetime(test_data.Arrival_Time).dt.hour
test_data["Arrival_min"] = pd.to_datetime(test_data.Arrival_Time).dt.minute
test_data.drop(["Arrival_Time"], axis = 1, inplace = True)

# Duration
duration = list(test_data["Duration"])

for i in range(len(duration)):
    if len(duration[i].split()) != 2:    # Check if duration contains only hour or mins
        if "h" in duration[i]:
            duration[i] = duration[i].strip() + " 0m"   # Adds 0 minute
        else:
            duration[i] = "0h " + duration[i]           # Adds 0 hour

duration_hours = []
duration_mins = []
for i in range(len(duration)):
    duration_hours.append(int(duration[i].split(sep = "h")[0]))    # Extract hours from duration
    duration_mins.append(int(duration[i].split(sep = "m")[0].split()[-1]))   # Extracts only minutes from duration

# Adding Duration column to test set
test_data["Duration_hours"] = duration_hours
test_data["Duration_mins"] = duration_mins
test_data.drop(["Duration"], axis = 1, inplace = True)


# Categorical data

print("Airline")
print("-"*75)
print(test_data["Airline"].value_counts())
Airline = pd.get_dummies(test_data["Airline"], drop_first= True)

print()

print("Source")
print("-"*75)
print(test_data["Source"].value_counts())
Source = pd.get_dummies(test_data["Source"], drop_first= True)

print()

print("Destination")
print("-"*75)
print(test_data["Destination"].value_counts())
Destination = pd.get_dummies(test_data["Destination"], drop_first = True)

# Additional_Info contains almost 80% no_info
# Route and Total_Stops are related to each other
test_data.drop(["Route", "Additional_Info"], axis = 1, inplace = True)

# Replacing Total_Stops
test_data.replace({"non-stop": 0, "1 stop": 1, "2 stops": 2, "3 stops": 3, "4 stops": 4}, inplace = True)

# Concatenate dataframe --> test_data + Airline + Source + Destination
data_test = pd.concat([test_data, Airline, Source, Destination], axis = 1)

data_test.drop(["Airline", "Source", "Destination"], axis = 1, inplace = True)

print()
print()

print("Shape of test data : ", data_test.shape)
Test data Info
---------------------------------------------------------------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2671 entries, 0 to 2670
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Airline          2671 non-null   object
 1   Date_of_Journey  2671 non-null   object
 2   Source           2671 non-null   object
 3   Destination      2671 non-null   object
 4   Route            2671 non-null   object
 5   Dep_Time         2671 non-null   object
 6   Arrival_Time     2671 non-null   object
 7   Duration         2671 non-null   object
 8   Total_Stops      2671 non-null   object
 9   Additional_Info  2671 non-null   object
dtypes: object(10)
memory usage: 208.8+ KB
None


Null values :
---------------------------------------------------------------------------
Airline            0
Date_of_Journey    0
Source             0
Destination        0
Route              0
Dep_Time           0
Arrival_Time       0
Duration           0
Total_Stops        0
Additional_Info    0
dtype: int64
Airline
---------------------------------------------------------------------------
Jet Airways                          897
IndiGo                               511
Air India                            440
Multiple carriers                    347
SpiceJet                             208
Vistara                              129
Air Asia                              86
GoAir                                 46
Multiple carriers Premium economy      3
Vistara Premium economy                2
Jet Airways Business                   2
Name: Airline, dtype: int64

Source
---------------------------------------------------------------------------
Delhi       1145
Kolkata      710
Banglore     555
Mumbai       186
Chennai       75
Name: Source, dtype: int64

Destination
---------------------------------------------------------------------------
Cochin       1145
Banglore      710
Delhi         317
New Delhi     238
Hyderabad     186
Kolkata        75
Name: Destination, dtype: int64


Shape of test data :  (2671, 28)
In [51]:
data_test.shape
Out[51]:
(2671, 28)
In [52]:
data_test.head()
Out[52]:
Total_Stops Journey_day Journey_month Dep_hour Dep_min Arrival_hour Arrival_min Duration_hours Duration_mins Air India GoAir IndiGo Jet Airways Jet Airways Business Multiple carriers Multiple carriers Premium economy SpiceJet Vistara Vistara Premium economy Chennai Delhi Kolkata Mumbai Cochin Delhi Hyderabad Kolkata New Delhi
0 1 6 6 17 30 4 25 10 55 0 0 0 1 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0
1 1 12 5 6 20 10 20 4 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0
2 1 21 5 19 15 19 0 23 45 0 0 0 1 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0
3 1 21 5 8 0 21 0 13 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 1 0 0 0 0
4 0 24 6 23 55 2 45 2 50 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0

Feature Selection¶

In [53]:
data_train.head()
Out[53]:
Total_Stops Price Journy_Day Journy_Month Dep_hour Dep_min Arrival_hour Arrival_min Duration_hours Duration_mins Airline_Air India Airline_GoAir Airline_IndiGo Airline_Jet Airways Airline_Jet Airways Business Airline_Multiple carriers Airline_Multiple carriers Premium economy Airline_SpiceJet Airline_Trujet Airline_Vistara Airline_Vistara Premium economy Source_Chennai Source_Delhi Source_Kolkata Source_Mumbai Destination_Cochin Destination_Delhi Destination_Hyderabad Destination_Kolkata Destination_New Delhi
0 0 3897 24 3 22 20 1 10 2 50 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1
1 2 7662 1 5 5 50 13 15 7 25 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0
2 2 13882 9 6 9 25 4 25 19 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0
3 1 6218 12 5 18 5 23 30 5 25 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0
4 1 13302 1 3 16 50 21 35 4 45 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1
In [54]:
data_train.columns
Out[54]:
Index(['Total_Stops', 'Price', 'Journy_Day', 'Journy_Month', 'Dep_hour',
       'Dep_min', 'Arrival_hour', 'Arrival_min', 'Duration_hours',
       'Duration_mins', 'Airline_Air India', 'Airline_GoAir', 'Airline_IndiGo',
       'Airline_Jet Airways', 'Airline_Jet Airways Business',
       'Airline_Multiple carriers',
       'Airline_Multiple carriers Premium economy', 'Airline_SpiceJet',
       'Airline_Trujet', 'Airline_Vistara', 'Airline_Vistara Premium economy',
       'Source_Chennai', 'Source_Delhi', 'Source_Kolkata', 'Source_Mumbai',
       'Destination_Cochin', 'Destination_Delhi', 'Destination_Hyderabad',
       'Destination_Kolkata', 'Destination_New Delhi'],
      dtype='object')
In [55]:
X = data_train.loc[:, ['Total_Stops', 'Journy_Day', 'Journy_Month', 'Dep_hour', 'Dep_min', 'Arrival_hour', 
                      'Duration_hours', 'Duration_mins', 'Airline_Air India', 'Airline_GoAir', 'Airline_IndiGo',
                      'Airline_Jet Airways', 'Airline_Jet Airways Business', 'Airline_Multiple carriers',
                      'Airline_Multiple carriers Premium economy', 'Airline_SpiceJet', 'Airline_Trujet',
                      'Airline_Vistara', 'Airline_Vistara Premium economy', 'Source_Chennai', 'Source_Delhi', 'Source_Kolkata',
                      'Source_Mumbai', 'Destination_Cochin', 'Destination_Delhi', 'Destination_Hyderabad', 'Destination_Kolkata',
                      'Destination_New Delhi']]
In [56]:
X.head()
Out[56]:
Total_Stops Journy_Day Journy_Month Dep_hour Dep_min Arrival_hour Duration_hours Duration_mins Airline_Air India Airline_GoAir Airline_IndiGo Airline_Jet Airways Airline_Jet Airways Business Airline_Multiple carriers Airline_Multiple carriers Premium economy Airline_SpiceJet Airline_Trujet Airline_Vistara Airline_Vistara Premium economy Source_Chennai Source_Delhi Source_Kolkata Source_Mumbai Destination_Cochin Destination_Delhi Destination_Hyderabad Destination_Kolkata Destination_New Delhi
0 0 24 3 22 20 1 2 50 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1
1 2 1 5 5 50 13 7 25 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0
2 2 9 6 9 25 4 19 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0
3 1 12 5 18 5 23 5 25 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0
4 1 1 3 16 50 21 4 45 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1
In [57]:
y = data_train.iloc[:, 1]
y.head()
Out[57]:
0     3897
1     7662
2    13882
3     6218
4    13302
Name: Price, dtype: int64
In [58]:
plt.figure(figsize=(18, 18))
sns.heatmap(train_df.corr(), annot=True, cmap='RdYlGn')
plt.show()
/var/folders/16/3468kndx5l1_zj5r84tsybgc0000gn/T/ipykernel_196/432270364.py:2: FutureWarning: The default value of numeric_only in DataFrame.corr is deprecated. In a future version, it will default to False. Select only valid columns or specify the value of numeric_only to silence this warning.
  sns.heatmap(train_df.corr(), annot=True, cmap='RdYlGn')
In [59]:
# Important feature using ExtraTreesRegressor
from sklearn.ensemble import ExtraTreesRegressor
selection = ExtraTreesRegressor()
selection.fit(X, y)
Out[59]:
ExtraTreesRegressor()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
ExtraTreesRegressor()
In [60]:
print(selection.feature_importances_)
[2.16281287e-01 1.43561114e-01 5.53275544e-02 2.68252534e-02
 2.46186722e-02 3.23170611e-02 1.29350477e-01 2.16917730e-02
 1.14972302e-02 1.97152005e-03 1.85747928e-02 1.38611825e-01
 6.79090519e-02 1.81502325e-02 8.97371777e-04 3.24599248e-03
 1.12508503e-04 4.97202809e-03 8.22143694e-05 6.20273501e-04
 1.48915815e-02 3.36118903e-03 6.09060533e-03 9.23108677e-03
 1.63018973e-02 8.03502905e-03 5.39686688e-04 2.49306908e-02]
In [61]:
plt.figure(figsize=(12, 8))
feature_imp = pd.Series(selection.feature_importances_, index=X.columns)
feature_imp.nlargest(20).plot(kind='barh')
plt.show()

Training part¶

In [62]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
In [63]:
from sklearn.ensemble import RandomForestRegressor
In [64]:
rf_model = RandomForestRegressor()
rf_model.fit(X_train, y_train)
Out[64]:
RandomForestRegressor()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
RandomForestRegressor()
In [65]:
y_pred = rf_model.predict(X_test)
In [66]:
rf_model.score(X_train, y_train)
Out[66]:
0.9524462409365866
In [67]:
rf_model.score(X_test, y_test)
Out[67]:
0.7975464372137973
In [68]:
sns.distplot(y_test - y_pred)
/var/folders/16/3468kndx5l1_zj5r84tsybgc0000gn/T/ipykernel_196/2332411778.py:1: UserWarning: 

`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  sns.distplot(y_test - y_pred)
Out[68]:
<AxesSubplot:xlabel='Price', ylabel='Density'>
In [69]:
plt.scatter(y_test, y_pred, alpha=0.5)
plt.xlabel('y_test')
plt.ylabel('y_label')
plt.show()
In [70]:
from sklearn import metrics
In [71]:
print("MAE: ", metrics.mean_absolute_error(y_test, y_pred))
print("MSE: ", metrics.mean_squared_error(y_test, y_pred))
print("RMSE: ", np.sqrt(metrics.mean_squared_error(y_test, y_pred)))
MAE:  1179.374780326833
MSE:  4365315.123823327
RMSE:  2089.3336554565253
In [72]:
metrics.r2_score(y_test, y_pred)
Out[72]:
0.7975464372137973

Hyperparameter Tuning¶

In [73]:
from sklearn.model_selection import RandomizedSearchCV
In [74]:
# Randomized Search CV

## Number of trees in ramdom forest
n_estimators = [int(x) for x in np.linspace(start=100, stop=1200, num=12)]
## Number of features to consider at every split
max_features = ['auto', 'sqrt']
## Maximum number of level in tree
max_depth = [int(x) for x in np.linspace(5, 30, num=6)]
## Minimum number of samples required to split a node
min_samples_split = [2, 5, 10, 15, 100]
## Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 5, 10]
In [75]:
## create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf}
print(random_grid)
{'n_estimators': [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000, 1100, 1200], 'max_features': ['auto', 'sqrt'], 'max_depth': [5, 10, 15, 20, 25, 30], 'min_samples_split': [2, 5, 10, 15, 100], 'min_samples_leaf': [1, 2, 5, 10]}
In [76]:
rf_random = RandomizedSearchCV(estimator=rf_model, param_distributions=random_grid, scoring='neg_mean_squared_error',
                               n_iter=10, cv=5, verbose=2, random_state=42, n_jobs=1)
In [77]:
rf_random.fit(X_train, y_train)
Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=5, min_samples_split=5, n_estimators=900; total time=   2.0s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=5, min_samples_split=5, n_estimators=900; total time=   2.0s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=5, min_samples_split=5, n_estimators=900; total time=   2.0s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=5, min_samples_split=5, n_estimators=900; total time=   2.0s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=5, min_samples_split=5, n_estimators=900; total time=   2.0s
[CV] END max_depth=15, max_features=sqrt, min_samples_leaf=2, min_samples_split=10, n_estimators=1100; total time=   3.0s
[CV] END max_depth=15, max_features=sqrt, min_samples_leaf=2, min_samples_split=10, n_estimators=1100; total time=   3.0s
[CV] END max_depth=15, max_features=sqrt, min_samples_leaf=2, min_samples_split=10, n_estimators=1100; total time=   3.0s
[CV] END max_depth=15, max_features=sqrt, min_samples_leaf=2, min_samples_split=10, n_estimators=1100; total time=   3.1s
[CV] END max_depth=15, max_features=sqrt, min_samples_leaf=2, min_samples_split=10, n_estimators=1100; total time=   3.1s
/Users/thomas/miniforge3/lib/python3.9/site-packages/sklearn/ensemble/_forest.py:414: FutureWarning: `max_features='auto'` has been deprecated in 1.1 and will be removed in 1.3. To keep the past behaviour, explicitly set `max_features=1.0` or remove this parameter as it is also the default value for RandomForestRegressors and ExtraTreesRegressors.
  warn(
[CV] END max_depth=15, max_features=auto, min_samples_leaf=5, min_samples_split=100, n_estimators=300; total time=   2.0s
/Users/thomas/miniforge3/lib/python3.9/site-packages/sklearn/ensemble/_forest.py:414: FutureWarning: `max_features='auto'` has been deprecated in 1.1 and will be removed in 1.3. To keep the past behaviour, explicitly set `max_features=1.0` or remove this parameter as it is also the default value for RandomForestRegressors and ExtraTreesRegressors.
  warn(
[CV] END max_depth=15, max_features=auto, min_samples_leaf=5, min_samples_split=100, n_estimators=300; total time=   2.0s
/Users/thomas/miniforge3/lib/python3.9/site-packages/sklearn/ensemble/_forest.py:414: FutureWarning: `max_features='auto'` has been deprecated in 1.1 and will be removed in 1.3. To keep the past behaviour, explicitly set `max_features=1.0` or remove this parameter as it is also the default value for RandomForestRegressors and ExtraTreesRegressors.
  warn(
[CV] END max_depth=15, max_features=auto, min_samples_leaf=5, min_samples_split=100, n_estimators=300; total time=   2.0s
/Users/thomas/miniforge3/lib/python3.9/site-packages/sklearn/ensemble/_forest.py:414: FutureWarning: `max_features='auto'` has been deprecated in 1.1 and will be removed in 1.3. To keep the past behaviour, explicitly set `max_features=1.0` or remove this parameter as it is also the default value for RandomForestRegressors and ExtraTreesRegressors.
  warn(
[CV] END max_depth=15, max_features=auto, min_samples_leaf=5, min_samples_split=100, n_estimators=300; total time=   2.0s
/Users/thomas/miniforge3/lib/python3.9/site-packages/sklearn/ensemble/_forest.py:414: FutureWarning: `max_features='auto'` has been deprecated in 1.1 and will be removed in 1.3. To keep the past behaviour, explicitly set `max_features=1.0` or remove this parameter as it is also the default value for RandomForestRegressors and ExtraTreesRegressors.
  warn(
[CV] END max_depth=15, max_features=auto, min_samples_leaf=5, min_samples_split=100, n_estimators=300; total time=   2.0s
/Users/thomas/miniforge3/lib/python3.9/site-packages/sklearn/ensemble/_forest.py:414: FutureWarning: `max_features='auto'` has been deprecated in 1.1 and will be removed in 1.3. To keep the past behaviour, explicitly set `max_features=1.0` or remove this parameter as it is also the default value for RandomForestRegressors and ExtraTreesRegressors.
  warn(
[CV] END max_depth=15, max_features=auto, min_samples_leaf=5, min_samples_split=5, n_estimators=400; total time=   3.6s
/Users/thomas/miniforge3/lib/python3.9/site-packages/sklearn/ensemble/_forest.py:414: FutureWarning: `max_features='auto'` has been deprecated in 1.1 and will be removed in 1.3. To keep the past behaviour, explicitly set `max_features=1.0` or remove this parameter as it is also the default value for RandomForestRegressors and ExtraTreesRegressors.
  warn(
[CV] END max_depth=15, max_features=auto, min_samples_leaf=5, min_samples_split=5, n_estimators=400; total time=   3.6s
/Users/thomas/miniforge3/lib/python3.9/site-packages/sklearn/ensemble/_forest.py:414: FutureWarning: `max_features='auto'` has been deprecated in 1.1 and will be removed in 1.3. To keep the past behaviour, explicitly set `max_features=1.0` or remove this parameter as it is also the default value for RandomForestRegressors and ExtraTreesRegressors.
  warn(
[CV] END max_depth=15, max_features=auto, min_samples_leaf=5, min_samples_split=5, n_estimators=400; total time=   3.5s
/Users/thomas/miniforge3/lib/python3.9/site-packages/sklearn/ensemble/_forest.py:414: FutureWarning: `max_features='auto'` has been deprecated in 1.1 and will be removed in 1.3. To keep the past behaviour, explicitly set `max_features=1.0` or remove this parameter as it is also the default value for RandomForestRegressors and ExtraTreesRegressors.
  warn(
[CV] END max_depth=15, max_features=auto, min_samples_leaf=5, min_samples_split=5, n_estimators=400; total time=   3.5s
/Users/thomas/miniforge3/lib/python3.9/site-packages/sklearn/ensemble/_forest.py:414: FutureWarning: `max_features='auto'` has been deprecated in 1.1 and will be removed in 1.3. To keep the past behaviour, explicitly set `max_features=1.0` or remove this parameter as it is also the default value for RandomForestRegressors and ExtraTreesRegressors.
  warn(
[CV] END max_depth=15, max_features=auto, min_samples_leaf=5, min_samples_split=5, n_estimators=400; total time=   3.6s
/Users/thomas/miniforge3/lib/python3.9/site-packages/sklearn/ensemble/_forest.py:414: FutureWarning: `max_features='auto'` has been deprecated in 1.1 and will be removed in 1.3. To keep the past behaviour, explicitly set `max_features=1.0` or remove this parameter as it is also the default value for RandomForestRegressors and ExtraTreesRegressors.
  warn(
[CV] END max_depth=20, max_features=auto, min_samples_leaf=10, min_samples_split=5, n_estimators=700; total time=   5.5s
/Users/thomas/miniforge3/lib/python3.9/site-packages/sklearn/ensemble/_forest.py:414: FutureWarning: `max_features='auto'` has been deprecated in 1.1 and will be removed in 1.3. To keep the past behaviour, explicitly set `max_features=1.0` or remove this parameter as it is also the default value for RandomForestRegressors and ExtraTreesRegressors.
  warn(
[CV] END max_depth=20, max_features=auto, min_samples_leaf=10, min_samples_split=5, n_estimators=700; total time=   5.6s
/Users/thomas/miniforge3/lib/python3.9/site-packages/sklearn/ensemble/_forest.py:414: FutureWarning: `max_features='auto'` has been deprecated in 1.1 and will be removed in 1.3. To keep the past behaviour, explicitly set `max_features=1.0` or remove this parameter as it is also the default value for RandomForestRegressors and ExtraTreesRegressors.
  warn(
[CV] END max_depth=20, max_features=auto, min_samples_leaf=10, min_samples_split=5, n_estimators=700; total time=   5.6s
/Users/thomas/miniforge3/lib/python3.9/site-packages/sklearn/ensemble/_forest.py:414: FutureWarning: `max_features='auto'` has been deprecated in 1.1 and will be removed in 1.3. To keep the past behaviour, explicitly set `max_features=1.0` or remove this parameter as it is also the default value for RandomForestRegressors and ExtraTreesRegressors.
  warn(
[CV] END max_depth=20, max_features=auto, min_samples_leaf=10, min_samples_split=5, n_estimators=700; total time=   5.5s
/Users/thomas/miniforge3/lib/python3.9/site-packages/sklearn/ensemble/_forest.py:414: FutureWarning: `max_features='auto'` has been deprecated in 1.1 and will be removed in 1.3. To keep the past behaviour, explicitly set `max_features=1.0` or remove this parameter as it is also the default value for RandomForestRegressors and ExtraTreesRegressors.
  warn(
[CV] END max_depth=20, max_features=auto, min_samples_leaf=10, min_samples_split=5, n_estimators=700; total time=   5.5s
[CV] END max_depth=25, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=1000; total time=   4.2s
[CV] END max_depth=25, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=1000; total time=   4.2s
[CV] END max_depth=25, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=1000; total time=   4.1s
[CV] END max_depth=25, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=1000; total time=   4.1s
[CV] END max_depth=25, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=1000; total time=   4.1s
[CV] END max_depth=5, max_features=sqrt, min_samples_leaf=10, min_samples_split=15, n_estimators=1100; total time=   1.6s
[CV] END max_depth=5, max_features=sqrt, min_samples_leaf=10, min_samples_split=15, n_estimators=1100; total time=   1.6s
[CV] END max_depth=5, max_features=sqrt, min_samples_leaf=10, min_samples_split=15, n_estimators=1100; total time=   1.6s
[CV] END max_depth=5, max_features=sqrt, min_samples_leaf=10, min_samples_split=15, n_estimators=1100; total time=   1.7s
[CV] END max_depth=5, max_features=sqrt, min_samples_leaf=10, min_samples_split=15, n_estimators=1100; total time=   1.6s
[CV] END max_depth=15, max_features=sqrt, min_samples_leaf=1, min_samples_split=15, n_estimators=300; total time=   0.8s
[CV] END max_depth=15, max_features=sqrt, min_samples_leaf=1, min_samples_split=15, n_estimators=300; total time=   0.8s
[CV] END max_depth=15, max_features=sqrt, min_samples_leaf=1, min_samples_split=15, n_estimators=300; total time=   0.8s
[CV] END max_depth=15, max_features=sqrt, min_samples_leaf=1, min_samples_split=15, n_estimators=300; total time=   0.8s
[CV] END max_depth=15, max_features=sqrt, min_samples_leaf=1, min_samples_split=15, n_estimators=300; total time=   0.8s
[CV] END max_depth=5, max_features=sqrt, min_samples_leaf=2, min_samples_split=10, n_estimators=700; total time=   1.1s
[CV] END max_depth=5, max_features=sqrt, min_samples_leaf=2, min_samples_split=10, n_estimators=700; total time=   1.0s
[CV] END max_depth=5, max_features=sqrt, min_samples_leaf=2, min_samples_split=10, n_estimators=700; total time=   1.0s
[CV] END max_depth=5, max_features=sqrt, min_samples_leaf=2, min_samples_split=10, n_estimators=700; total time=   1.0s
[CV] END max_depth=5, max_features=sqrt, min_samples_leaf=2, min_samples_split=10, n_estimators=700; total time=   1.0s
/Users/thomas/miniforge3/lib/python3.9/site-packages/sklearn/ensemble/_forest.py:414: FutureWarning: `max_features='auto'` has been deprecated in 1.1 and will be removed in 1.3. To keep the past behaviour, explicitly set `max_features=1.0` or remove this parameter as it is also the default value for RandomForestRegressors and ExtraTreesRegressors.
  warn(
[CV] END max_depth=20, max_features=auto, min_samples_leaf=1, min_samples_split=15, n_estimators=700; total time=   6.8s
/Users/thomas/miniforge3/lib/python3.9/site-packages/sklearn/ensemble/_forest.py:414: FutureWarning: `max_features='auto'` has been deprecated in 1.1 and will be removed in 1.3. To keep the past behaviour, explicitly set `max_features=1.0` or remove this parameter as it is also the default value for RandomForestRegressors and ExtraTreesRegressors.
  warn(
[CV] END max_depth=20, max_features=auto, min_samples_leaf=1, min_samples_split=15, n_estimators=700; total time=   6.7s
/Users/thomas/miniforge3/lib/python3.9/site-packages/sklearn/ensemble/_forest.py:414: FutureWarning: `max_features='auto'` has been deprecated in 1.1 and will be removed in 1.3. To keep the past behaviour, explicitly set `max_features=1.0` or remove this parameter as it is also the default value for RandomForestRegressors and ExtraTreesRegressors.
  warn(
[CV] END max_depth=20, max_features=auto, min_samples_leaf=1, min_samples_split=15, n_estimators=700; total time=   6.7s
/Users/thomas/miniforge3/lib/python3.9/site-packages/sklearn/ensemble/_forest.py:414: FutureWarning: `max_features='auto'` has been deprecated in 1.1 and will be removed in 1.3. To keep the past behaviour, explicitly set `max_features=1.0` or remove this parameter as it is also the default value for RandomForestRegressors and ExtraTreesRegressors.
  warn(
[CV] END max_depth=20, max_features=auto, min_samples_leaf=1, min_samples_split=15, n_estimators=700; total time=   6.8s
/Users/thomas/miniforge3/lib/python3.9/site-packages/sklearn/ensemble/_forest.py:414: FutureWarning: `max_features='auto'` has been deprecated in 1.1 and will be removed in 1.3. To keep the past behaviour, explicitly set `max_features=1.0` or remove this parameter as it is also the default value for RandomForestRegressors and ExtraTreesRegressors.
  warn(
[CV] END max_depth=20, max_features=auto, min_samples_leaf=1, min_samples_split=15, n_estimators=700; total time=   6.7s
/Users/thomas/miniforge3/lib/python3.9/site-packages/sklearn/ensemble/_forest.py:414: FutureWarning: `max_features='auto'` has been deprecated in 1.1 and will be removed in 1.3. To keep the past behaviour, explicitly set `max_features=1.0` or remove this parameter as it is also the default value for RandomForestRegressors and ExtraTreesRegressors.
  warn(
Out[77]:
RandomizedSearchCV(cv=5, estimator=RandomForestRegressor(), n_jobs=1,
                   param_distributions={'max_depth': [5, 10, 15, 20, 25, 30],
                                        'max_features': ['auto', 'sqrt'],
                                        'min_samples_leaf': [1, 2, 5, 10],
                                        'min_samples_split': [2, 5, 10, 15,
                                                              100],
                                        'n_estimators': [100, 200, 300, 400,
                                                         500, 600, 700, 800,
                                                         900, 1000, 1100,
                                                         1200]},
                   random_state=42, scoring='neg_mean_squared_error',
                   verbose=2)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
RandomizedSearchCV(cv=5, estimator=RandomForestRegressor(), n_jobs=1,
                   param_distributions={'max_depth': [5, 10, 15, 20, 25, 30],
                                        'max_features': ['auto', 'sqrt'],
                                        'min_samples_leaf': [1, 2, 5, 10],
                                        'min_samples_split': [2, 5, 10, 15,
                                                              100],
                                        'n_estimators': [100, 200, 300, 400,
                                                         500, 600, 700, 800,
                                                         900, 1000, 1100,
                                                         1200]},
                   random_state=42, scoring='neg_mean_squared_error',
                   verbose=2)
RandomForestRegressor()
RandomForestRegressor()
In [78]:
rf_random.best_params_
Out[78]:
{'n_estimators': 700,
 'min_samples_split': 15,
 'min_samples_leaf': 1,
 'max_features': 'auto',
 'max_depth': 20}
In [79]:
y_pred = rf_random.predict(X_test)
In [80]:
plt.figure(figsize=(8, 8))
sns.displot(y_test-y_pred)
plt.show()
<Figure size 800x800 with 0 Axes>
In [81]:
plt.scatter(y_test, y_pred, alpha=0.5)
plt.xlabel('y_test')
plt.ylabel('y_label')
plt.show()
In [82]:
print("MAE: ", metrics.mean_absolute_error(y_test, y_pred))
print("MSE: ", metrics.mean_squared_error(y_test, y_pred))
print("RMSE: ", np.sqrt(metrics.mean_squared_error(y_test, y_pred)))
MAE:  1168.0038153543371
MSE:  4038348.193369491
RMSE:  2009.5641799578063
In [83]:
metrics.r2_score(y_test, y_pred)
Out[83]:
0.812710432963472
In [84]:
import pickle
In [85]:
file = open('model.pkl', 'wb')

pickle.dump(rf_random, file)
In [86]:
model = pickle.load(open('model.pkl', 'rb'))
In [87]:
y_pred = model.predict(X_test)
In [88]:
metrics.r2_score(y_test, y_pred)
Out[88]:
0.812710432963472
In [89]:
data_test.head()
Out[89]:
Total_Stops Journey_day Journey_month Dep_hour Dep_min Arrival_hour Arrival_min Duration_hours Duration_mins Air India GoAir IndiGo Jet Airways Jet Airways Business Multiple carriers Multiple carriers Premium economy SpiceJet Vistara Vistara Premium economy Chennai Delhi Kolkata Mumbai Cochin Delhi Hyderabad Kolkata New Delhi
0 1 6 6 17 30 4 25 10 55 0 0 0 1 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0
1 1 12 5 6 20 10 20 4 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0
2 1 21 5 19 15 19 0 23 45 0 0 0 1 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0
3 1 21 5 8 0 21 0 13 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 1 0 0 0 0
4 0 24 6 23 55 2 45 2 50 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0
In [ ]: