import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
train_df = pd.read_excel('Data_Train.xlsx')
pd.set_option('display.max_columns', None)
train_df.head()
Airline | Date_of_Journey | Source | Destination | Route | Dep_Time | Arrival_Time | Duration | Total_Stops | Additional_Info | Price | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | IndiGo | 24/03/2019 | Banglore | New Delhi | BLR → DEL | 22:20 | 01:10 22 Mar | 2h 50m | non-stop | No info | 3897 |
1 | Air India | 1/05/2019 | Kolkata | Banglore | CCU → IXR → BBI → BLR | 05:50 | 13:15 | 7h 25m | 2 stops | No info | 7662 |
2 | Jet Airways | 9/06/2019 | Delhi | Cochin | DEL → LKO → BOM → COK | 09:25 | 04:25 10 Jun | 19h | 2 stops | No info | 13882 |
3 | IndiGo | 12/05/2019 | Kolkata | Banglore | CCU → NAG → BLR | 18:05 | 23:30 | 5h 25m | 1 stop | No info | 6218 |
4 | IndiGo | 01/03/2019 | Banglore | New Delhi | BLR → NAG → DEL | 16:50 | 21:35 | 4h 45m | 1 stop | No info | 13302 |
train_df.shape
(10683, 11)
type(train_df)
pandas.core.frame.DataFrame
train_df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 10683 entries, 0 to 10682 Data columns (total 11 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Airline 10683 non-null object 1 Date_of_Journey 10683 non-null object 2 Source 10683 non-null object 3 Destination 10683 non-null object 4 Route 10682 non-null object 5 Dep_Time 10683 non-null object 6 Arrival_Time 10683 non-null object 7 Duration 10683 non-null object 8 Total_Stops 10682 non-null object 9 Additional_Info 10683 non-null object 10 Price 10683 non-null int64 dtypes: int64(1), object(10) memory usage: 918.2+ KB
train_df.Duration.value_counts()
2h 50m 550 1h 30m 386 2h 45m 337 2h 55m 337 2h 35m 329 ... 31h 30m 1 30h 25m 1 42h 5m 1 4h 10m 1 47h 40m 1 Name: Duration, Length: 368, dtype: int64
train_df.isna().sum()
Airline 0 Date_of_Journey 0 Source 0 Destination 0 Route 1 Dep_Time 0 Arrival_Time 0 Duration 0 Total_Stops 1 Additional_Info 0 Price 0 dtype: int64
train_df.dropna(inplace=True)
train_df.isna().sum()
Airline 0 Date_of_Journey 0 Source 0 Destination 0 Route 0 Dep_Time 0 Arrival_Time 0 Duration 0 Total_Stops 0 Additional_Info 0 Price 0 dtype: int64
train_df['Journy_Day'] = pd.to_datetime(train_df.Date_of_Journey, format='%d/%m/%Y').dt.day
train_df['Journy_Month'] = pd.to_datetime(train_df.Date_of_Journey, format='%d/%m/%Y').dt.month
train_df.head()
Airline | Date_of_Journey | Source | Destination | Route | Dep_Time | Arrival_Time | Duration | Total_Stops | Additional_Info | Price | Journy_Day | Journy_Month | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | IndiGo | 24/03/2019 | Banglore | New Delhi | BLR → DEL | 22:20 | 01:10 22 Mar | 2h 50m | non-stop | No info | 3897 | 24 | 3 |
1 | Air India | 1/05/2019 | Kolkata | Banglore | CCU → IXR → BBI → BLR | 05:50 | 13:15 | 7h 25m | 2 stops | No info | 7662 | 1 | 5 |
2 | Jet Airways | 9/06/2019 | Delhi | Cochin | DEL → LKO → BOM → COK | 09:25 | 04:25 10 Jun | 19h | 2 stops | No info | 13882 | 9 | 6 |
3 | IndiGo | 12/05/2019 | Kolkata | Banglore | CCU → NAG → BLR | 18:05 | 23:30 | 5h 25m | 1 stop | No info | 6218 | 12 | 5 |
4 | IndiGo | 01/03/2019 | Banglore | New Delhi | BLR → NAG → DEL | 16:50 | 21:35 | 4h 45m | 1 stop | No info | 13302 | 1 | 3 |
train_df.drop(['Date_of_Journey'], axis=1, inplace=True)
train_df.head()
Airline | Source | Destination | Route | Dep_Time | Arrival_Time | Duration | Total_Stops | Additional_Info | Price | Journy_Day | Journy_Month | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | IndiGo | Banglore | New Delhi | BLR → DEL | 22:20 | 01:10 22 Mar | 2h 50m | non-stop | No info | 3897 | 24 | 3 |
1 | Air India | Kolkata | Banglore | CCU → IXR → BBI → BLR | 05:50 | 13:15 | 7h 25m | 2 stops | No info | 7662 | 1 | 5 |
2 | Jet Airways | Delhi | Cochin | DEL → LKO → BOM → COK | 09:25 | 04:25 10 Jun | 19h | 2 stops | No info | 13882 | 9 | 6 |
3 | IndiGo | Kolkata | Banglore | CCU → NAG → BLR | 18:05 | 23:30 | 5h 25m | 1 stop | No info | 6218 | 12 | 5 |
4 | IndiGo | Banglore | New Delhi | BLR → NAG → DEL | 16:50 | 21:35 | 4h 45m | 1 stop | No info | 13302 | 1 | 3 |
## Departure time is when the plane leaves the gate
train_df['Dep_hour'] = pd.to_datetime(train_df.Dep_Time).dt.hour
train_df['Dep_min'] = pd.to_datetime(train_df.Dep_Time).dt.minute
train_df.drop(['Dep_Time'], axis=1, inplace=True)
train_df.head()
Airline | Source | Destination | Route | Arrival_Time | Duration | Total_Stops | Additional_Info | Price | Journy_Day | Journy_Month | Dep_hour | Dep_min | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | IndiGo | Banglore | New Delhi | BLR → DEL | 01:10 22 Mar | 2h 50m | non-stop | No info | 3897 | 24 | 3 | 22 | 20 |
1 | Air India | Kolkata | Banglore | CCU → IXR → BBI → BLR | 13:15 | 7h 25m | 2 stops | No info | 7662 | 1 | 5 | 5 | 50 |
2 | Jet Airways | Delhi | Cochin | DEL → LKO → BOM → COK | 04:25 10 Jun | 19h | 2 stops | No info | 13882 | 9 | 6 | 9 | 25 |
3 | IndiGo | Kolkata | Banglore | CCU → NAG → BLR | 23:30 | 5h 25m | 1 stop | No info | 6218 | 12 | 5 | 18 | 5 |
4 | IndiGo | Banglore | New Delhi | BLR → NAG → DEL | 21:35 | 4h 45m | 1 stop | No info | 13302 | 1 | 3 | 16 | 50 |
## Arrival time is when the plane pulls up to the gate
train_df['Arrival_hour'] = pd.to_datetime(train_df.Arrival_Time).dt.hour
train_df['Arrival_min'] = pd.to_datetime(train_df.Arrival_Time).dt.minute
train_df.drop(['Arrival_Time'], axis=1, inplace=True)
train_df.head()
Airline | Source | Destination | Route | Duration | Total_Stops | Additional_Info | Price | Journy_Day | Journy_Month | Dep_hour | Dep_min | Arrival_hour | Arrival_min | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | IndiGo | Banglore | New Delhi | BLR → DEL | 2h 50m | non-stop | No info | 3897 | 24 | 3 | 22 | 20 | 1 | 10 |
1 | Air India | Kolkata | Banglore | CCU → IXR → BBI → BLR | 7h 25m | 2 stops | No info | 7662 | 1 | 5 | 5 | 50 | 13 | 15 |
2 | Jet Airways | Delhi | Cochin | DEL → LKO → BOM → COK | 19h | 2 stops | No info | 13882 | 9 | 6 | 9 | 25 | 4 | 25 |
3 | IndiGo | Kolkata | Banglore | CCU → NAG → BLR | 5h 25m | 1 stop | No info | 6218 | 12 | 5 | 18 | 5 | 23 | 30 |
4 | IndiGo | Banglore | New Delhi | BLR → NAG → DEL | 4h 45m | 1 stop | No info | 13302 | 1 | 3 | 16 | 50 | 21 | 35 |
## Time taken to reach destination is called Duration
## It is difference between Departure Time and Arrival Time
## Assigning and converting Duration column into list
duration = list(train_df.Duration)
for i in range(len(duration)):
if len(duration[i].split()) != 2: # Check if duration contains only hour or mins
if 'h' in duration[i]:
duration[i] = duration[i].strip() + " 0m" # Adding 0 mins
else:
duration[i] = "0h " + duration[i]
duration_hours = []
duration_mins = []
for i in range(len(duration)):
duration_hours.append(int(duration[i].split("h")[0])) # Extract hours from Duration
duration_mins.append(int(duration[i].split('m')[0].split()[-1])) # Extract onlye minutes from Duration
train_df["Duration_hours"] = duration_hours
train_df["Duration_mins"] = duration_mins
train_df.tail()
Airline | Source | Destination | Route | Duration | Total_Stops | Additional_Info | Price | Journy_Day | Journy_Month | Dep_hour | Dep_min | Arrival_hour | Arrival_min | Duration_hours | Duration_mins | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
10678 | Air Asia | Kolkata | Banglore | CCU → BLR | 2h 30m | non-stop | No info | 4107 | 9 | 4 | 19 | 55 | 22 | 25 | 2 | 30 |
10679 | Air India | Kolkata | Banglore | CCU → BLR | 2h 35m | non-stop | No info | 4145 | 27 | 4 | 20 | 45 | 23 | 20 | 2 | 35 |
10680 | Jet Airways | Banglore | Delhi | BLR → DEL | 3h | non-stop | No info | 7229 | 27 | 4 | 8 | 20 | 11 | 20 | 3 | 0 |
10681 | Vistara | Banglore | New Delhi | BLR → DEL | 2h 40m | non-stop | No info | 12648 | 1 | 3 | 11 | 30 | 14 | 10 | 2 | 40 |
10682 | Air India | Delhi | Cochin | DEL → GOI → BOM → COK | 8h 20m | 2 stops | No info | 11753 | 9 | 5 | 10 | 55 | 19 | 15 | 8 | 20 |
train_df.drop(['Duration'], axis=1, inplace=True)
train_df.head()
Airline | Source | Destination | Route | Total_Stops | Additional_Info | Price | Journy_Day | Journy_Month | Dep_hour | Dep_min | Arrival_hour | Arrival_min | Duration_hours | Duration_mins | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | IndiGo | Banglore | New Delhi | BLR → DEL | non-stop | No info | 3897 | 24 | 3 | 22 | 20 | 1 | 10 | 2 | 50 |
1 | Air India | Kolkata | Banglore | CCU → IXR → BBI → BLR | 2 stops | No info | 7662 | 1 | 5 | 5 | 50 | 13 | 15 | 7 | 25 |
2 | Jet Airways | Delhi | Cochin | DEL → LKO → BOM → COK | 2 stops | No info | 13882 | 9 | 6 | 9 | 25 | 4 | 25 | 19 | 0 |
3 | IndiGo | Kolkata | Banglore | CCU → NAG → BLR | 1 stop | No info | 6218 | 12 | 5 | 18 | 5 | 23 | 30 | 5 | 25 |
4 | IndiGo | Banglore | New Delhi | BLR → NAG → DEL | 1 stop | No info | 13302 | 1 | 3 | 16 | 50 | 21 | 35 | 4 | 45 |
train_df.Airline.value_counts()
Jet Airways 3849 IndiGo 2053 Air India 1751 Multiple carriers 1196 SpiceJet 818 Vistara 479 Air Asia 319 GoAir 194 Multiple carriers Premium economy 13 Jet Airways Business 6 Vistara Premium economy 3 Trujet 1 Name: Airline, dtype: int64
sns.catplot(y="Price", x="Airline", data=train_df.sort_values("Price", ascending=False), kind='boxen', height=6, aspect=3)
<seaborn.axisgrid.FacetGrid at 0x137168d30>
len(train_df['Airline'].unique())
12
Airline = train_df[['Airline']]
Airline = pd.get_dummies(Airline, drop_first=True)
Airline.head()
Airline_Air India | Airline_GoAir | Airline_IndiGo | Airline_Jet Airways | Airline_Jet Airways Business | Airline_Multiple carriers | Airline_Multiple carriers Premium economy | Airline_SpiceJet | Airline_Trujet | Airline_Vistara | Airline_Vistara Premium economy | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
2 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
3 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
4 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
train_df.Source.value_counts()
Delhi 4536 Kolkata 2871 Banglore 2197 Mumbai 697 Chennai 381 Name: Source, dtype: int64
sns.catplot(y='Price', x='Source', data=train_df.sort_values('Price', ascending=False), kind='boxen', height=6, aspect=3)
<seaborn.axisgrid.FacetGrid at 0x1370b8be0>
Source = train_df[['Source']]
Source = pd.get_dummies(Source, drop_first=True)
Source.head()
Source_Chennai | Source_Delhi | Source_Kolkata | Source_Mumbai | |
---|---|---|---|---|
0 | 0 | 0 | 0 | 0 |
1 | 0 | 0 | 1 | 0 |
2 | 0 | 1 | 0 | 0 |
3 | 0 | 0 | 1 | 0 |
4 | 0 | 0 | 0 | 0 |
Destination = train_df[['Destination']]
Destination = pd.get_dummies(Destination, drop_first=True)
Destination.head()
Destination_Cochin | Destination_Delhi | Destination_Hyderabad | Destination_Kolkata | Destination_New Delhi | |
---|---|---|---|---|---|
0 | 0 | 0 | 0 | 0 | 1 |
1 | 0 | 0 | 0 | 0 | 0 |
2 | 1 | 0 | 0 | 0 | 0 |
3 | 0 | 0 | 0 | 0 | 0 |
4 | 0 | 0 | 0 | 0 | 1 |
train_df.Route
0 BLR → DEL 1 CCU → IXR → BBI → BLR 2 DEL → LKO → BOM → COK 3 CCU → NAG → BLR 4 BLR → NAG → DEL ... 10678 CCU → BLR 10679 CCU → BLR 10680 BLR → DEL 10681 BLR → DEL 10682 DEL → GOI → BOM → COK Name: Route, Length: 10682, dtype: object
a = train_df.Additional_Info=='No info'
a.mean()
0.781127129750983
## Additional_Info conatins almost 80% no_info
# Route and Total_Stops are related to each other
train_df.drop(['Route', 'Additional_Info'], axis=1, inplace=True)
train_df.head()
Airline | Source | Destination | Total_Stops | Price | Journy_Day | Journy_Month | Dep_hour | Dep_min | Arrival_hour | Arrival_min | Duration_hours | Duration_mins | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | IndiGo | Banglore | New Delhi | non-stop | 3897 | 24 | 3 | 22 | 20 | 1 | 10 | 2 | 50 |
1 | Air India | Kolkata | Banglore | 2 stops | 7662 | 1 | 5 | 5 | 50 | 13 | 15 | 7 | 25 |
2 | Jet Airways | Delhi | Cochin | 2 stops | 13882 | 9 | 6 | 9 | 25 | 4 | 25 | 19 | 0 |
3 | IndiGo | Kolkata | Banglore | 1 stop | 6218 | 12 | 5 | 18 | 5 | 23 | 30 | 5 | 25 |
4 | IndiGo | Banglore | New Delhi | 1 stop | 13302 | 1 | 3 | 16 | 50 | 21 | 35 | 4 | 45 |
train_df.Total_Stops.value_counts()
1 stop 5625 non-stop 3491 2 stops 1520 3 stops 45 4 stops 1 Name: Total_Stops, dtype: int64
## As this is case of Ordinal Categorical type we perform LabelEncoder
## Here vlaues are assigned with corresponding keys
train_df.replace({"non-stop": 0, "1 stop": 1, "2 stops": 2, "3 stops": 3, "4 stops": 4}, inplace=True)
# train_df.head()
data_train = pd.concat([train_df, Airline, Source, Destination], axis=1)
data_train.head()
Airline | Source | Destination | Total_Stops | Price | Journy_Day | Journy_Month | Dep_hour | Dep_min | Arrival_hour | Arrival_min | Duration_hours | Duration_mins | Airline_Air India | Airline_GoAir | Airline_IndiGo | Airline_Jet Airways | Airline_Jet Airways Business | Airline_Multiple carriers | Airline_Multiple carriers Premium economy | Airline_SpiceJet | Airline_Trujet | Airline_Vistara | Airline_Vistara Premium economy | Source_Chennai | Source_Delhi | Source_Kolkata | Source_Mumbai | Destination_Cochin | Destination_Delhi | Destination_Hyderabad | Destination_Kolkata | Destination_New Delhi | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | IndiGo | Banglore | New Delhi | 0 | 3897 | 24 | 3 | 22 | 20 | 1 | 10 | 2 | 50 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
1 | Air India | Kolkata | Banglore | 2 | 7662 | 1 | 5 | 5 | 50 | 13 | 15 | 7 | 25 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 |
2 | Jet Airways | Delhi | Cochin | 2 | 13882 | 9 | 6 | 9 | 25 | 4 | 25 | 19 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
3 | IndiGo | Kolkata | Banglore | 1 | 6218 | 12 | 5 | 18 | 5 | 23 | 30 | 5 | 25 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 |
4 | IndiGo | Banglore | New Delhi | 1 | 13302 | 1 | 3 | 16 | 50 | 21 | 35 | 4 | 45 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
data_train.drop(['Destination', 'Source', 'Airline'], axis=1, inplace=True)
data_train.head()
Total_Stops | Price | Journy_Day | Journy_Month | Dep_hour | Dep_min | Arrival_hour | Arrival_min | Duration_hours | Duration_mins | Airline_Air India | Airline_GoAir | Airline_IndiGo | Airline_Jet Airways | Airline_Jet Airways Business | Airline_Multiple carriers | Airline_Multiple carriers Premium economy | Airline_SpiceJet | Airline_Trujet | Airline_Vistara | Airline_Vistara Premium economy | Source_Chennai | Source_Delhi | Source_Kolkata | Source_Mumbai | Destination_Cochin | Destination_Delhi | Destination_Hyderabad | Destination_Kolkata | Destination_New Delhi | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 3897 | 24 | 3 | 22 | 20 | 1 | 10 | 2 | 50 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
1 | 2 | 7662 | 1 | 5 | 5 | 50 | 13 | 15 | 7 | 25 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 |
2 | 2 | 13882 | 9 | 6 | 9 | 25 | 4 | 25 | 19 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
3 | 1 | 6218 | 12 | 5 | 18 | 5 | 23 | 30 | 5 | 25 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 |
4 | 1 | 13302 | 1 | 3 | 16 | 50 | 21 | 35 | 4 | 45 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
data_train.shape
(10682, 30)
test_data = pd.read_excel('Test_set.xlsx')
test_data.head()
Airline | Date_of_Journey | Source | Destination | Route | Dep_Time | Arrival_Time | Duration | Total_Stops | Additional_Info | |
---|---|---|---|---|---|---|---|---|---|---|
0 | Jet Airways | 6/06/2019 | Delhi | Cochin | DEL → BOM → COK | 17:30 | 04:25 07 Jun | 10h 55m | 1 stop | No info |
1 | IndiGo | 12/05/2019 | Kolkata | Banglore | CCU → MAA → BLR | 06:20 | 10:20 | 4h | 1 stop | No info |
2 | Jet Airways | 21/05/2019 | Delhi | Cochin | DEL → BOM → COK | 19:15 | 19:00 22 May | 23h 45m | 1 stop | In-flight meal not included |
3 | Multiple carriers | 21/05/2019 | Delhi | Cochin | DEL → BOM → COK | 08:00 | 21:00 | 13h | 1 stop | No info |
4 | Air Asia | 24/06/2019 | Banglore | Delhi | BLR → DEL | 23:55 | 02:45 25 Jun | 2h 50m | non-stop | No info |
test_data.shape
(2671, 10)
print("Test data Info")
print("-"*75)
print(test_data.info())
print()
print()
print("Null values :")
print("-"*75)
test_data.dropna(inplace = True)
print(test_data.isnull().sum())
# EDA
# Date_of_Journey
test_data["Journey_day"] = pd.to_datetime(test_data.Date_of_Journey, format="%d/%m/%Y").dt.day
test_data["Journey_month"] = pd.to_datetime(test_data["Date_of_Journey"], format = "%d/%m/%Y").dt.month
test_data.drop(["Date_of_Journey"], axis = 1, inplace = True)
# Dep_Time
test_data["Dep_hour"] = pd.to_datetime(test_data["Dep_Time"]).dt.hour
test_data["Dep_min"] = pd.to_datetime(test_data["Dep_Time"]).dt.minute
test_data.drop(["Dep_Time"], axis = 1, inplace = True)
# Arrival_Time
test_data["Arrival_hour"] = pd.to_datetime(test_data.Arrival_Time).dt.hour
test_data["Arrival_min"] = pd.to_datetime(test_data.Arrival_Time).dt.minute
test_data.drop(["Arrival_Time"], axis = 1, inplace = True)
# Duration
duration = list(test_data["Duration"])
for i in range(len(duration)):
if len(duration[i].split()) != 2: # Check if duration contains only hour or mins
if "h" in duration[i]:
duration[i] = duration[i].strip() + " 0m" # Adds 0 minute
else:
duration[i] = "0h " + duration[i] # Adds 0 hour
duration_hours = []
duration_mins = []
for i in range(len(duration)):
duration_hours.append(int(duration[i].split(sep = "h")[0])) # Extract hours from duration
duration_mins.append(int(duration[i].split(sep = "m")[0].split()[-1])) # Extracts only minutes from duration
# Adding Duration column to test set
test_data["Duration_hours"] = duration_hours
test_data["Duration_mins"] = duration_mins
test_data.drop(["Duration"], axis = 1, inplace = True)
# Categorical data
print("Airline")
print("-"*75)
print(test_data["Airline"].value_counts())
Airline = pd.get_dummies(test_data["Airline"], drop_first= True)
print()
print("Source")
print("-"*75)
print(test_data["Source"].value_counts())
Source = pd.get_dummies(test_data["Source"], drop_first= True)
print()
print("Destination")
print("-"*75)
print(test_data["Destination"].value_counts())
Destination = pd.get_dummies(test_data["Destination"], drop_first = True)
# Additional_Info contains almost 80% no_info
# Route and Total_Stops are related to each other
test_data.drop(["Route", "Additional_Info"], axis = 1, inplace = True)
# Replacing Total_Stops
test_data.replace({"non-stop": 0, "1 stop": 1, "2 stops": 2, "3 stops": 3, "4 stops": 4}, inplace = True)
# Concatenate dataframe --> test_data + Airline + Source + Destination
data_test = pd.concat([test_data, Airline, Source, Destination], axis = 1)
data_test.drop(["Airline", "Source", "Destination"], axis = 1, inplace = True)
print()
print()
print("Shape of test data : ", data_test.shape)
Test data Info --------------------------------------------------------------------------- <class 'pandas.core.frame.DataFrame'> RangeIndex: 2671 entries, 0 to 2670 Data columns (total 10 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Airline 2671 non-null object 1 Date_of_Journey 2671 non-null object 2 Source 2671 non-null object 3 Destination 2671 non-null object 4 Route 2671 non-null object 5 Dep_Time 2671 non-null object 6 Arrival_Time 2671 non-null object 7 Duration 2671 non-null object 8 Total_Stops 2671 non-null object 9 Additional_Info 2671 non-null object dtypes: object(10) memory usage: 208.8+ KB None Null values : --------------------------------------------------------------------------- Airline 0 Date_of_Journey 0 Source 0 Destination 0 Route 0 Dep_Time 0 Arrival_Time 0 Duration 0 Total_Stops 0 Additional_Info 0 dtype: int64 Airline --------------------------------------------------------------------------- Jet Airways 897 IndiGo 511 Air India 440 Multiple carriers 347 SpiceJet 208 Vistara 129 Air Asia 86 GoAir 46 Multiple carriers Premium economy 3 Vistara Premium economy 2 Jet Airways Business 2 Name: Airline, dtype: int64 Source --------------------------------------------------------------------------- Delhi 1145 Kolkata 710 Banglore 555 Mumbai 186 Chennai 75 Name: Source, dtype: int64 Destination --------------------------------------------------------------------------- Cochin 1145 Banglore 710 Delhi 317 New Delhi 238 Hyderabad 186 Kolkata 75 Name: Destination, dtype: int64 Shape of test data : (2671, 28)
data_test.shape
(2671, 28)
data_test.head()
Total_Stops | Journey_day | Journey_month | Dep_hour | Dep_min | Arrival_hour | Arrival_min | Duration_hours | Duration_mins | Air India | GoAir | IndiGo | Jet Airways | Jet Airways Business | Multiple carriers | Multiple carriers Premium economy | SpiceJet | Vistara | Vistara Premium economy | Chennai | Delhi | Kolkata | Mumbai | Cochin | Delhi | Hyderabad | Kolkata | New Delhi | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 6 | 6 | 17 | 30 | 4 | 25 | 10 | 55 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
1 | 1 | 12 | 5 | 6 | 20 | 10 | 20 | 4 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 |
2 | 1 | 21 | 5 | 19 | 15 | 19 | 0 | 23 | 45 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
3 | 1 | 21 | 5 | 8 | 0 | 21 | 0 | 13 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
4 | 0 | 24 | 6 | 23 | 55 | 2 | 45 | 2 | 50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
data_train.head()
Total_Stops | Price | Journy_Day | Journy_Month | Dep_hour | Dep_min | Arrival_hour | Arrival_min | Duration_hours | Duration_mins | Airline_Air India | Airline_GoAir | Airline_IndiGo | Airline_Jet Airways | Airline_Jet Airways Business | Airline_Multiple carriers | Airline_Multiple carriers Premium economy | Airline_SpiceJet | Airline_Trujet | Airline_Vistara | Airline_Vistara Premium economy | Source_Chennai | Source_Delhi | Source_Kolkata | Source_Mumbai | Destination_Cochin | Destination_Delhi | Destination_Hyderabad | Destination_Kolkata | Destination_New Delhi | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 3897 | 24 | 3 | 22 | 20 | 1 | 10 | 2 | 50 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
1 | 2 | 7662 | 1 | 5 | 5 | 50 | 13 | 15 | 7 | 25 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 |
2 | 2 | 13882 | 9 | 6 | 9 | 25 | 4 | 25 | 19 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
3 | 1 | 6218 | 12 | 5 | 18 | 5 | 23 | 30 | 5 | 25 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 |
4 | 1 | 13302 | 1 | 3 | 16 | 50 | 21 | 35 | 4 | 45 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
data_train.columns
Index(['Total_Stops', 'Price', 'Journy_Day', 'Journy_Month', 'Dep_hour', 'Dep_min', 'Arrival_hour', 'Arrival_min', 'Duration_hours', 'Duration_mins', 'Airline_Air India', 'Airline_GoAir', 'Airline_IndiGo', 'Airline_Jet Airways', 'Airline_Jet Airways Business', 'Airline_Multiple carriers', 'Airline_Multiple carriers Premium economy', 'Airline_SpiceJet', 'Airline_Trujet', 'Airline_Vistara', 'Airline_Vistara Premium economy', 'Source_Chennai', 'Source_Delhi', 'Source_Kolkata', 'Source_Mumbai', 'Destination_Cochin', 'Destination_Delhi', 'Destination_Hyderabad', 'Destination_Kolkata', 'Destination_New Delhi'], dtype='object')
X = data_train.loc[:, ['Total_Stops', 'Journy_Day', 'Journy_Month', 'Dep_hour', 'Dep_min', 'Arrival_hour',
'Duration_hours', 'Duration_mins', 'Airline_Air India', 'Airline_GoAir', 'Airline_IndiGo',
'Airline_Jet Airways', 'Airline_Jet Airways Business', 'Airline_Multiple carriers',
'Airline_Multiple carriers Premium economy', 'Airline_SpiceJet', 'Airline_Trujet',
'Airline_Vistara', 'Airline_Vistara Premium economy', 'Source_Chennai', 'Source_Delhi', 'Source_Kolkata',
'Source_Mumbai', 'Destination_Cochin', 'Destination_Delhi', 'Destination_Hyderabad', 'Destination_Kolkata',
'Destination_New Delhi']]
X.head()
Total_Stops | Journy_Day | Journy_Month | Dep_hour | Dep_min | Arrival_hour | Duration_hours | Duration_mins | Airline_Air India | Airline_GoAir | Airline_IndiGo | Airline_Jet Airways | Airline_Jet Airways Business | Airline_Multiple carriers | Airline_Multiple carriers Premium economy | Airline_SpiceJet | Airline_Trujet | Airline_Vistara | Airline_Vistara Premium economy | Source_Chennai | Source_Delhi | Source_Kolkata | Source_Mumbai | Destination_Cochin | Destination_Delhi | Destination_Hyderabad | Destination_Kolkata | Destination_New Delhi | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 24 | 3 | 22 | 20 | 1 | 2 | 50 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
1 | 2 | 1 | 5 | 5 | 50 | 13 | 7 | 25 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 |
2 | 2 | 9 | 6 | 9 | 25 | 4 | 19 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
3 | 1 | 12 | 5 | 18 | 5 | 23 | 5 | 25 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 |
4 | 1 | 1 | 3 | 16 | 50 | 21 | 4 | 45 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
y = data_train.iloc[:, 1]
y.head()
0 3897 1 7662 2 13882 3 6218 4 13302 Name: Price, dtype: int64
plt.figure(figsize=(18, 18))
sns.heatmap(train_df.corr(), annot=True, cmap='RdYlGn')
plt.show()
/var/folders/16/3468kndx5l1_zj5r84tsybgc0000gn/T/ipykernel_196/432270364.py:2: FutureWarning: The default value of numeric_only in DataFrame.corr is deprecated. In a future version, it will default to False. Select only valid columns or specify the value of numeric_only to silence this warning. sns.heatmap(train_df.corr(), annot=True, cmap='RdYlGn')
# Important feature using ExtraTreesRegressor
from sklearn.ensemble import ExtraTreesRegressor
selection = ExtraTreesRegressor()
selection.fit(X, y)
ExtraTreesRegressor()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
ExtraTreesRegressor()
print(selection.feature_importances_)
[2.16281287e-01 1.43561114e-01 5.53275544e-02 2.68252534e-02 2.46186722e-02 3.23170611e-02 1.29350477e-01 2.16917730e-02 1.14972302e-02 1.97152005e-03 1.85747928e-02 1.38611825e-01 6.79090519e-02 1.81502325e-02 8.97371777e-04 3.24599248e-03 1.12508503e-04 4.97202809e-03 8.22143694e-05 6.20273501e-04 1.48915815e-02 3.36118903e-03 6.09060533e-03 9.23108677e-03 1.63018973e-02 8.03502905e-03 5.39686688e-04 2.49306908e-02]
plt.figure(figsize=(12, 8))
feature_imp = pd.Series(selection.feature_importances_, index=X.columns)
feature_imp.nlargest(20).plot(kind='barh')
plt.show()
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
from sklearn.ensemble import RandomForestRegressor
rf_model = RandomForestRegressor()
rf_model.fit(X_train, y_train)
RandomForestRegressor()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
RandomForestRegressor()
y_pred = rf_model.predict(X_test)
rf_model.score(X_train, y_train)
0.9524462409365866
rf_model.score(X_test, y_test)
0.7975464372137973
sns.distplot(y_test - y_pred)
/var/folders/16/3468kndx5l1_zj5r84tsybgc0000gn/T/ipykernel_196/2332411778.py:1: UserWarning: `distplot` is a deprecated function and will be removed in seaborn v0.14.0. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). For a guide to updating your code to use the new functions, please see https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751 sns.distplot(y_test - y_pred)
<AxesSubplot:xlabel='Price', ylabel='Density'>
plt.scatter(y_test, y_pred, alpha=0.5)
plt.xlabel('y_test')
plt.ylabel('y_label')
plt.show()
from sklearn import metrics
print("MAE: ", metrics.mean_absolute_error(y_test, y_pred))
print("MSE: ", metrics.mean_squared_error(y_test, y_pred))
print("RMSE: ", np.sqrt(metrics.mean_squared_error(y_test, y_pred)))
MAE: 1179.374780326833 MSE: 4365315.123823327 RMSE: 2089.3336554565253
metrics.r2_score(y_test, y_pred)
0.7975464372137973
from sklearn.model_selection import RandomizedSearchCV
# Randomized Search CV
## Number of trees in ramdom forest
n_estimators = [int(x) for x in np.linspace(start=100, stop=1200, num=12)]
## Number of features to consider at every split
max_features = ['auto', 'sqrt']
## Maximum number of level in tree
max_depth = [int(x) for x in np.linspace(5, 30, num=6)]
## Minimum number of samples required to split a node
min_samples_split = [2, 5, 10, 15, 100]
## Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 5, 10]
## create the random grid
random_grid = {'n_estimators': n_estimators,
'max_features': max_features,
'max_depth': max_depth,
'min_samples_split': min_samples_split,
'min_samples_leaf': min_samples_leaf}
print(random_grid)
{'n_estimators': [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000, 1100, 1200], 'max_features': ['auto', 'sqrt'], 'max_depth': [5, 10, 15, 20, 25, 30], 'min_samples_split': [2, 5, 10, 15, 100], 'min_samples_leaf': [1, 2, 5, 10]}
rf_random = RandomizedSearchCV(estimator=rf_model, param_distributions=random_grid, scoring='neg_mean_squared_error',
n_iter=10, cv=5, verbose=2, random_state=42, n_jobs=1)
rf_random.fit(X_train, y_train)
Fitting 5 folds for each of 10 candidates, totalling 50 fits [CV] END max_depth=10, max_features=sqrt, min_samples_leaf=5, min_samples_split=5, n_estimators=900; total time= 2.0s [CV] END max_depth=10, max_features=sqrt, min_samples_leaf=5, min_samples_split=5, n_estimators=900; total time= 2.0s [CV] END max_depth=10, max_features=sqrt, min_samples_leaf=5, min_samples_split=5, n_estimators=900; total time= 2.0s [CV] END max_depth=10, max_features=sqrt, min_samples_leaf=5, min_samples_split=5, n_estimators=900; total time= 2.0s [CV] END max_depth=10, max_features=sqrt, min_samples_leaf=5, min_samples_split=5, n_estimators=900; total time= 2.0s [CV] END max_depth=15, max_features=sqrt, min_samples_leaf=2, min_samples_split=10, n_estimators=1100; total time= 3.0s [CV] END max_depth=15, max_features=sqrt, min_samples_leaf=2, min_samples_split=10, n_estimators=1100; total time= 3.0s [CV] END max_depth=15, max_features=sqrt, min_samples_leaf=2, min_samples_split=10, n_estimators=1100; total time= 3.0s [CV] END max_depth=15, max_features=sqrt, min_samples_leaf=2, min_samples_split=10, n_estimators=1100; total time= 3.1s [CV] END max_depth=15, max_features=sqrt, min_samples_leaf=2, min_samples_split=10, n_estimators=1100; total time= 3.1s
/Users/thomas/miniforge3/lib/python3.9/site-packages/sklearn/ensemble/_forest.py:414: FutureWarning: `max_features='auto'` has been deprecated in 1.1 and will be removed in 1.3. To keep the past behaviour, explicitly set `max_features=1.0` or remove this parameter as it is also the default value for RandomForestRegressors and ExtraTreesRegressors. warn(
[CV] END max_depth=15, max_features=auto, min_samples_leaf=5, min_samples_split=100, n_estimators=300; total time= 2.0s
/Users/thomas/miniforge3/lib/python3.9/site-packages/sklearn/ensemble/_forest.py:414: FutureWarning: `max_features='auto'` has been deprecated in 1.1 and will be removed in 1.3. To keep the past behaviour, explicitly set `max_features=1.0` or remove this parameter as it is also the default value for RandomForestRegressors and ExtraTreesRegressors. warn(
[CV] END max_depth=15, max_features=auto, min_samples_leaf=5, min_samples_split=100, n_estimators=300; total time= 2.0s
/Users/thomas/miniforge3/lib/python3.9/site-packages/sklearn/ensemble/_forest.py:414: FutureWarning: `max_features='auto'` has been deprecated in 1.1 and will be removed in 1.3. To keep the past behaviour, explicitly set `max_features=1.0` or remove this parameter as it is also the default value for RandomForestRegressors and ExtraTreesRegressors. warn(
[CV] END max_depth=15, max_features=auto, min_samples_leaf=5, min_samples_split=100, n_estimators=300; total time= 2.0s
/Users/thomas/miniforge3/lib/python3.9/site-packages/sklearn/ensemble/_forest.py:414: FutureWarning: `max_features='auto'` has been deprecated in 1.1 and will be removed in 1.3. To keep the past behaviour, explicitly set `max_features=1.0` or remove this parameter as it is also the default value for RandomForestRegressors and ExtraTreesRegressors. warn(
[CV] END max_depth=15, max_features=auto, min_samples_leaf=5, min_samples_split=100, n_estimators=300; total time= 2.0s
/Users/thomas/miniforge3/lib/python3.9/site-packages/sklearn/ensemble/_forest.py:414: FutureWarning: `max_features='auto'` has been deprecated in 1.1 and will be removed in 1.3. To keep the past behaviour, explicitly set `max_features=1.0` or remove this parameter as it is also the default value for RandomForestRegressors and ExtraTreesRegressors. warn(
[CV] END max_depth=15, max_features=auto, min_samples_leaf=5, min_samples_split=100, n_estimators=300; total time= 2.0s
/Users/thomas/miniforge3/lib/python3.9/site-packages/sklearn/ensemble/_forest.py:414: FutureWarning: `max_features='auto'` has been deprecated in 1.1 and will be removed in 1.3. To keep the past behaviour, explicitly set `max_features=1.0` or remove this parameter as it is also the default value for RandomForestRegressors and ExtraTreesRegressors. warn(
[CV] END max_depth=15, max_features=auto, min_samples_leaf=5, min_samples_split=5, n_estimators=400; total time= 3.6s
/Users/thomas/miniforge3/lib/python3.9/site-packages/sklearn/ensemble/_forest.py:414: FutureWarning: `max_features='auto'` has been deprecated in 1.1 and will be removed in 1.3. To keep the past behaviour, explicitly set `max_features=1.0` or remove this parameter as it is also the default value for RandomForestRegressors and ExtraTreesRegressors. warn(
[CV] END max_depth=15, max_features=auto, min_samples_leaf=5, min_samples_split=5, n_estimators=400; total time= 3.6s
/Users/thomas/miniforge3/lib/python3.9/site-packages/sklearn/ensemble/_forest.py:414: FutureWarning: `max_features='auto'` has been deprecated in 1.1 and will be removed in 1.3. To keep the past behaviour, explicitly set `max_features=1.0` or remove this parameter as it is also the default value for RandomForestRegressors and ExtraTreesRegressors. warn(
[CV] END max_depth=15, max_features=auto, min_samples_leaf=5, min_samples_split=5, n_estimators=400; total time= 3.5s
/Users/thomas/miniforge3/lib/python3.9/site-packages/sklearn/ensemble/_forest.py:414: FutureWarning: `max_features='auto'` has been deprecated in 1.1 and will be removed in 1.3. To keep the past behaviour, explicitly set `max_features=1.0` or remove this parameter as it is also the default value for RandomForestRegressors and ExtraTreesRegressors. warn(
[CV] END max_depth=15, max_features=auto, min_samples_leaf=5, min_samples_split=5, n_estimators=400; total time= 3.5s
/Users/thomas/miniforge3/lib/python3.9/site-packages/sklearn/ensemble/_forest.py:414: FutureWarning: `max_features='auto'` has been deprecated in 1.1 and will be removed in 1.3. To keep the past behaviour, explicitly set `max_features=1.0` or remove this parameter as it is also the default value for RandomForestRegressors and ExtraTreesRegressors. warn(
[CV] END max_depth=15, max_features=auto, min_samples_leaf=5, min_samples_split=5, n_estimators=400; total time= 3.6s
/Users/thomas/miniforge3/lib/python3.9/site-packages/sklearn/ensemble/_forest.py:414: FutureWarning: `max_features='auto'` has been deprecated in 1.1 and will be removed in 1.3. To keep the past behaviour, explicitly set `max_features=1.0` or remove this parameter as it is also the default value for RandomForestRegressors and ExtraTreesRegressors. warn(
[CV] END max_depth=20, max_features=auto, min_samples_leaf=10, min_samples_split=5, n_estimators=700; total time= 5.5s
/Users/thomas/miniforge3/lib/python3.9/site-packages/sklearn/ensemble/_forest.py:414: FutureWarning: `max_features='auto'` has been deprecated in 1.1 and will be removed in 1.3. To keep the past behaviour, explicitly set `max_features=1.0` or remove this parameter as it is also the default value for RandomForestRegressors and ExtraTreesRegressors. warn(
[CV] END max_depth=20, max_features=auto, min_samples_leaf=10, min_samples_split=5, n_estimators=700; total time= 5.6s
/Users/thomas/miniforge3/lib/python3.9/site-packages/sklearn/ensemble/_forest.py:414: FutureWarning: `max_features='auto'` has been deprecated in 1.1 and will be removed in 1.3. To keep the past behaviour, explicitly set `max_features=1.0` or remove this parameter as it is also the default value for RandomForestRegressors and ExtraTreesRegressors. warn(
[CV] END max_depth=20, max_features=auto, min_samples_leaf=10, min_samples_split=5, n_estimators=700; total time= 5.6s
/Users/thomas/miniforge3/lib/python3.9/site-packages/sklearn/ensemble/_forest.py:414: FutureWarning: `max_features='auto'` has been deprecated in 1.1 and will be removed in 1.3. To keep the past behaviour, explicitly set `max_features=1.0` or remove this parameter as it is also the default value for RandomForestRegressors and ExtraTreesRegressors. warn(
[CV] END max_depth=20, max_features=auto, min_samples_leaf=10, min_samples_split=5, n_estimators=700; total time= 5.5s
/Users/thomas/miniforge3/lib/python3.9/site-packages/sklearn/ensemble/_forest.py:414: FutureWarning: `max_features='auto'` has been deprecated in 1.1 and will be removed in 1.3. To keep the past behaviour, explicitly set `max_features=1.0` or remove this parameter as it is also the default value for RandomForestRegressors and ExtraTreesRegressors. warn(
[CV] END max_depth=20, max_features=auto, min_samples_leaf=10, min_samples_split=5, n_estimators=700; total time= 5.5s [CV] END max_depth=25, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=1000; total time= 4.2s [CV] END max_depth=25, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=1000; total time= 4.2s [CV] END max_depth=25, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=1000; total time= 4.1s [CV] END max_depth=25, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=1000; total time= 4.1s [CV] END max_depth=25, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=1000; total time= 4.1s [CV] END max_depth=5, max_features=sqrt, min_samples_leaf=10, min_samples_split=15, n_estimators=1100; total time= 1.6s [CV] END max_depth=5, max_features=sqrt, min_samples_leaf=10, min_samples_split=15, n_estimators=1100; total time= 1.6s [CV] END max_depth=5, max_features=sqrt, min_samples_leaf=10, min_samples_split=15, n_estimators=1100; total time= 1.6s [CV] END max_depth=5, max_features=sqrt, min_samples_leaf=10, min_samples_split=15, n_estimators=1100; total time= 1.7s [CV] END max_depth=5, max_features=sqrt, min_samples_leaf=10, min_samples_split=15, n_estimators=1100; total time= 1.6s [CV] END max_depth=15, max_features=sqrt, min_samples_leaf=1, min_samples_split=15, n_estimators=300; total time= 0.8s [CV] END max_depth=15, max_features=sqrt, min_samples_leaf=1, min_samples_split=15, n_estimators=300; total time= 0.8s [CV] END max_depth=15, max_features=sqrt, min_samples_leaf=1, min_samples_split=15, n_estimators=300; total time= 0.8s [CV] END max_depth=15, max_features=sqrt, min_samples_leaf=1, min_samples_split=15, n_estimators=300; total time= 0.8s [CV] END max_depth=15, max_features=sqrt, min_samples_leaf=1, min_samples_split=15, n_estimators=300; total time= 0.8s [CV] END max_depth=5, max_features=sqrt, min_samples_leaf=2, min_samples_split=10, n_estimators=700; total time= 1.1s [CV] END max_depth=5, max_features=sqrt, min_samples_leaf=2, min_samples_split=10, n_estimators=700; total time= 1.0s [CV] END max_depth=5, max_features=sqrt, min_samples_leaf=2, min_samples_split=10, n_estimators=700; total time= 1.0s [CV] END max_depth=5, max_features=sqrt, min_samples_leaf=2, min_samples_split=10, n_estimators=700; total time= 1.0s [CV] END max_depth=5, max_features=sqrt, min_samples_leaf=2, min_samples_split=10, n_estimators=700; total time= 1.0s
/Users/thomas/miniforge3/lib/python3.9/site-packages/sklearn/ensemble/_forest.py:414: FutureWarning: `max_features='auto'` has been deprecated in 1.1 and will be removed in 1.3. To keep the past behaviour, explicitly set `max_features=1.0` or remove this parameter as it is also the default value for RandomForestRegressors and ExtraTreesRegressors. warn(
[CV] END max_depth=20, max_features=auto, min_samples_leaf=1, min_samples_split=15, n_estimators=700; total time= 6.8s
/Users/thomas/miniforge3/lib/python3.9/site-packages/sklearn/ensemble/_forest.py:414: FutureWarning: `max_features='auto'` has been deprecated in 1.1 and will be removed in 1.3. To keep the past behaviour, explicitly set `max_features=1.0` or remove this parameter as it is also the default value for RandomForestRegressors and ExtraTreesRegressors. warn(
[CV] END max_depth=20, max_features=auto, min_samples_leaf=1, min_samples_split=15, n_estimators=700; total time= 6.7s
/Users/thomas/miniforge3/lib/python3.9/site-packages/sklearn/ensemble/_forest.py:414: FutureWarning: `max_features='auto'` has been deprecated in 1.1 and will be removed in 1.3. To keep the past behaviour, explicitly set `max_features=1.0` or remove this parameter as it is also the default value for RandomForestRegressors and ExtraTreesRegressors. warn(
[CV] END max_depth=20, max_features=auto, min_samples_leaf=1, min_samples_split=15, n_estimators=700; total time= 6.7s
/Users/thomas/miniforge3/lib/python3.9/site-packages/sklearn/ensemble/_forest.py:414: FutureWarning: `max_features='auto'` has been deprecated in 1.1 and will be removed in 1.3. To keep the past behaviour, explicitly set `max_features=1.0` or remove this parameter as it is also the default value for RandomForestRegressors and ExtraTreesRegressors. warn(
[CV] END max_depth=20, max_features=auto, min_samples_leaf=1, min_samples_split=15, n_estimators=700; total time= 6.8s
/Users/thomas/miniforge3/lib/python3.9/site-packages/sklearn/ensemble/_forest.py:414: FutureWarning: `max_features='auto'` has been deprecated in 1.1 and will be removed in 1.3. To keep the past behaviour, explicitly set `max_features=1.0` or remove this parameter as it is also the default value for RandomForestRegressors and ExtraTreesRegressors. warn(
[CV] END max_depth=20, max_features=auto, min_samples_leaf=1, min_samples_split=15, n_estimators=700; total time= 6.7s
/Users/thomas/miniforge3/lib/python3.9/site-packages/sklearn/ensemble/_forest.py:414: FutureWarning: `max_features='auto'` has been deprecated in 1.1 and will be removed in 1.3. To keep the past behaviour, explicitly set `max_features=1.0` or remove this parameter as it is also the default value for RandomForestRegressors and ExtraTreesRegressors. warn(
RandomizedSearchCV(cv=5, estimator=RandomForestRegressor(), n_jobs=1, param_distributions={'max_depth': [5, 10, 15, 20, 25, 30], 'max_features': ['auto', 'sqrt'], 'min_samples_leaf': [1, 2, 5, 10], 'min_samples_split': [2, 5, 10, 15, 100], 'n_estimators': [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000, 1100, 1200]}, random_state=42, scoring='neg_mean_squared_error', verbose=2)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
RandomizedSearchCV(cv=5, estimator=RandomForestRegressor(), n_jobs=1, param_distributions={'max_depth': [5, 10, 15, 20, 25, 30], 'max_features': ['auto', 'sqrt'], 'min_samples_leaf': [1, 2, 5, 10], 'min_samples_split': [2, 5, 10, 15, 100], 'n_estimators': [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000, 1100, 1200]}, random_state=42, scoring='neg_mean_squared_error', verbose=2)
RandomForestRegressor()
RandomForestRegressor()
rf_random.best_params_
{'n_estimators': 700, 'min_samples_split': 15, 'min_samples_leaf': 1, 'max_features': 'auto', 'max_depth': 20}
y_pred = rf_random.predict(X_test)
plt.figure(figsize=(8, 8))
sns.displot(y_test-y_pred)
plt.show()
<Figure size 800x800 with 0 Axes>
plt.scatter(y_test, y_pred, alpha=0.5)
plt.xlabel('y_test')
plt.ylabel('y_label')
plt.show()
print("MAE: ", metrics.mean_absolute_error(y_test, y_pred))
print("MSE: ", metrics.mean_squared_error(y_test, y_pred))
print("RMSE: ", np.sqrt(metrics.mean_squared_error(y_test, y_pred)))
MAE: 1168.0038153543371 MSE: 4038348.193369491 RMSE: 2009.5641799578063
metrics.r2_score(y_test, y_pred)
0.812710432963472
import pickle
file = open('model.pkl', 'wb')
pickle.dump(rf_random, file)
model = pickle.load(open('model.pkl', 'rb'))
y_pred = model.predict(X_test)
metrics.r2_score(y_test, y_pred)
0.812710432963472
data_test.head()
Total_Stops | Journey_day | Journey_month | Dep_hour | Dep_min | Arrival_hour | Arrival_min | Duration_hours | Duration_mins | Air India | GoAir | IndiGo | Jet Airways | Jet Airways Business | Multiple carriers | Multiple carriers Premium economy | SpiceJet | Vistara | Vistara Premium economy | Chennai | Delhi | Kolkata | Mumbai | Cochin | Delhi | Hyderabad | Kolkata | New Delhi | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 6 | 6 | 17 | 30 | 4 | 25 | 10 | 55 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
1 | 1 | 12 | 5 | 6 | 20 | 10 | 20 | 4 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 |
2 | 1 | 21 | 5 | 19 | 15 | 19 | 0 | 23 | 45 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
3 | 1 | 21 | 5 | 8 | 0 | 21 | 0 | 13 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
4 | 0 | 24 | 6 | 23 | 55 | 2 | 45 | 2 | 50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |