Advanced Pandas Tutorial¶
In [2]:
# IPL 2024 — Pandas Cookbook
# Cell 1: Imports & options
import pandas as pd
import numpy as np
pd.set_option('display.max_columns', 50)
pd.set_option('display.width', 120)
In [3]:
# Cell 2: Create sample dataset and save CSV
data = {
'match_id': [1,1,2,2,3,3,4,4,5,5],
'date': ['2024-03-22','2024-03-22','2024-03-23','2024-03-23','2024-03-24','2024-03-24','2024-03-25','2024-03-25','2024-03-26','2024-03-26'],
'team': ['MI','CSK','MI','SRH','RCB','CSK','KKR','RCB','SRH','KKR'],
'opponent': ['CSK','MI','SRH','MI','CSK','RCB','RCB','KKR','KKR','SRH'],
'venue': ['Mumbai','Mumbai','Hyderabad','Hyderabad','Bengaluru','Bengaluru','Kolkata','Kolkata','Hyderabad','Kolkata'],
'runs_scored': [180,165,150,160,175,170,140,155,162,158],
'runs_against': [165,180,160,150,170,175,155,140,158,162],
'result': ['win','loss','loss','win','win','loss','loss','win','win','loss'],
'player_of_match': ['Rohit','Raina','Jadeja','Kohli','Warner','Du Plessis','Russell','ABD','Bhuvi','Narine'],
'attendance': [30000,30000,25000,25000,35000,35000,40000,40000,22000,40000]
}
df = pd.DataFrame(data)
df['date'] = pd.to_datetime(df['date'])
df.to_csv('ipl2024_sample.csv', index=False)
df.head()
Out[3]:
| match_id | date | team | opponent | venue | runs_scored | runs_against | result | player_of_match | attendance | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 2024-03-22 | MI | CSK | Mumbai | 180 | 165 | win | Rohit | 30000 |
| 1 | 1 | 2024-03-22 | CSK | MI | Mumbai | 165 | 180 | loss | Raina | 30000 |
| 2 | 2 | 2024-03-23 | MI | SRH | Hyderabad | 150 | 160 | loss | Jadeja | 25000 |
| 3 | 2 | 2024-03-23 | SRH | MI | Hyderabad | 160 | 150 | win | Kohli | 25000 |
| 4 | 3 | 2024-03-24 | RCB | CSK | Bengaluru | 175 | 170 | win | Warner | 35000 |
In [4]:
# Cell 3: DataFrame operations
df = pd.read_csv('ipl2024_sample.csv', parse_dates=['date'])
print("Shape:", df.shape)
df.head()
df.describe(include='all')
Shape: (10, 10)
Out[4]:
| match_id | date | team | opponent | venue | runs_scored | runs_against | result | player_of_match | attendance | |
|---|---|---|---|---|---|---|---|---|---|---|
| count | 10.000000 | 10 | 10 | 10 | 10 | 10.000000 | 10.000000 | 10 | 10 | 10.000000 |
| unique | NaN | NaN | 5 | 5 | 4 | NaN | NaN | 2 | 10 | NaN |
| top | NaN | NaN | MI | CSK | Hyderabad | NaN | NaN | win | Rohit | NaN |
| freq | NaN | NaN | 2 | 2 | 3 | NaN | NaN | 5 | 1 | NaN |
| mean | 3.000000 | 2024-03-24 00:00:00 | NaN | NaN | NaN | 161.500000 | 161.500000 | NaN | NaN | 32200.000000 |
| min | 1.000000 | 2024-03-22 00:00:00 | NaN | NaN | NaN | 140.000000 | 140.000000 | NaN | NaN | 22000.000000 |
| 25% | 2.000000 | 2024-03-23 00:00:00 | NaN | NaN | NaN | 155.750000 | 155.750000 | NaN | NaN | 26250.000000 |
| 50% | 3.000000 | 2024-03-24 00:00:00 | NaN | NaN | NaN | 161.000000 | 161.000000 | NaN | NaN | 32500.000000 |
| 75% | 4.000000 | 2024-03-25 00:00:00 | NaN | NaN | NaN | 168.750000 | 168.750000 | NaN | NaN | 38750.000000 |
| max | 5.000000 | 2024-03-26 00:00:00 | NaN | NaN | NaN | 180.000000 | 180.000000 | NaN | NaN | 40000.000000 |
| std | 1.490712 | NaN | NaN | NaN | NaN | 11.834507 | 11.834507 | NaN | NaN | 6795.423296 |
In [5]:
# Cell 4: Indexing (set_index, loc, iloc, boolean)
df_idx = df.set_index(['match_id','team'])
df_idx.head()
# Example: df_idx.loc[(2,'MI')]
# Example: df.iloc[:2]
# Example: df[df['attendance'] > 30000]
Out[5]:
| date | opponent | venue | runs_scored | runs_against | result | player_of_match | attendance | ||
|---|---|---|---|---|---|---|---|---|---|
| match_id | team | ||||||||
| 1 | MI | 2024-03-22 | CSK | Mumbai | 180 | 165 | win | Rohit | 30000 |
| CSK | 2024-03-22 | MI | Mumbai | 165 | 180 | loss | Raina | 30000 | |
| 2 | MI | 2024-03-23 | SRH | Hyderabad | 150 | 160 | loss | Jadeja | 25000 |
| SRH | 2024-03-23 | MI | Hyderabad | 160 | 150 | win | Kohli | 25000 | |
| 3 | RCB | 2024-03-24 | CSK | Bengaluru | 175 | 170 | win | Warner | 35000 |
In [6]:
# Cell 5: Merging (team info)
team_info = pd.DataFrame({
'team': ['MI','CSK','SRH','RCB','KKR'],
'coach': ['Mahela','Stephen','Trevor','Laxman','Brendon'],
'captain': ['Rohit','Dhoni','Kane','Virat','Shreyas']
})
merged = df.merge(team_info, on='team', how='left')
merged.head()
Out[6]:
| match_id | date | team | opponent | venue | runs_scored | runs_against | result | player_of_match | attendance | coach | captain | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 2024-03-22 | MI | CSK | Mumbai | 180 | 165 | win | Rohit | 30000 | Mahela | Rohit |
| 1 | 1 | 2024-03-22 | CSK | MI | Mumbai | 165 | 180 | loss | Raina | 30000 | Stephen | Dhoni |
| 2 | 2 | 2024-03-23 | MI | SRH | Hyderabad | 150 | 160 | loss | Jadeja | 25000 | Mahela | Rohit |
| 3 | 2 | 2024-03-23 | SRH | MI | Hyderabad | 160 | 150 | win | Kohli | 25000 | Trevor | Kane |
| 4 | 3 | 2024-03-24 | RCB | CSK | Bengaluru | 175 | 170 | win | Warner | 35000 | Laxman | Virat |
In [7]:
# Cell 6: Grouping and aggregation
grouped = df.groupby('team').agg(
matches=('match_id','nunique'),
total_runs_scored=('runs_scored','sum'),
avg_attendance=('attendance','mean')
).reset_index()
grouped
Out[7]:
| team | matches | total_runs_scored | avg_attendance | |
|---|---|---|---|---|
| 0 | CSK | 2 | 335 | 32500.0 |
| 1 | KKR | 2 | 298 | 40000.0 |
| 2 | MI | 2 | 330 | 27500.0 |
| 3 | RCB | 2 | 330 | 37500.0 |
| 4 | SRH | 2 | 322 | 23500.0 |
In [8]:
# Cell 7: MultiIndex / pivot / stack / unstack
pivot = df.pivot_table(index=['team'], columns='result', values='runs_scored', aggfunc='count', fill_value=0)
pivot
Out[8]:
| result | loss | win |
|---|---|---|
| team | ||
| CSK | 2 | 0 |
| KKR | 2 | 0 |
| MI | 1 | 1 |
| RCB | 0 | 2 |
| SRH | 0 | 2 |
In [9]:
# Cell 8: Pivot table with multiple aggregations
pv = pd.pivot_table(df, index=['team'], columns=['venue'], values=['runs_scored','attendance'],
aggfunc={'runs_scored': np.mean, 'attendance': np.sum}, fill_value=0)
pv
/var/folders/43/0syn7psx5w1fdxg82fm99xpc0000gn/T/ipykernel_7000/2644901071.py:2: FutureWarning: The provided callable <function mean at 0x10520b2e0> is currently using SeriesGroupBy.mean. In a future version of pandas, the provided callable will be used directly. To keep current behavior pass the string "mean" instead. pv = pd.pivot_table(df, index=['team'], columns=['venue'], values=['runs_scored','attendance'], /var/folders/43/0syn7psx5w1fdxg82fm99xpc0000gn/T/ipykernel_7000/2644901071.py:2: FutureWarning: The provided callable <function sum at 0x10520a0e0> is currently using SeriesGroupBy.sum. In a future version of pandas, the provided callable will be used directly. To keep current behavior pass the string "sum" instead. pv = pd.pivot_table(df, index=['team'], columns=['venue'], values=['runs_scored','attendance'],
Out[9]:
| attendance | runs_scored | |||||||
|---|---|---|---|---|---|---|---|---|
| venue | Bengaluru | Hyderabad | Kolkata | Mumbai | Bengaluru | Hyderabad | Kolkata | Mumbai |
| team | ||||||||
| CSK | 35000 | 0 | 0 | 30000 | 170.0 | 0.0 | 0.0 | 165.0 |
| KKR | 0 | 0 | 80000 | 0 | 0.0 | 0.0 | 149.0 | 0.0 |
| MI | 0 | 25000 | 0 | 30000 | 0.0 | 150.0 | 0.0 | 180.0 |
| RCB | 35000 | 0 | 40000 | 0 | 175.0 | 0.0 | 155.0 | 0.0 |
| SRH | 0 | 47000 | 0 | 0 | 0.0 | 161.0 | 0.0 | 0.0 |
In [10]:
# Cell 9: Window functions (rolling, expanding)
df_sorted = df.sort_values(['team','date']).copy()
df_sorted['runs_scored_rolling2'] = df_sorted.groupby('team')['runs_scored'].rolling(window=2, min_periods=1).mean().reset_index(0,drop=True)
df_sorted['runs_scored_expanding_mean'] = df_sorted.groupby('team')['runs_scored'].expanding(min_periods=1).mean().reset_index(0,drop=True)
df_sorted[['team','date','runs_scored','runs_scored_rolling2','runs_scored_expanding_mean']]
Out[10]:
| team | date | runs_scored | runs_scored_rolling2 | runs_scored_expanding_mean | |
|---|---|---|---|---|---|
| 1 | CSK | 2024-03-22 | 165 | 165.0 | 165.0 |
| 5 | CSK | 2024-03-24 | 170 | 167.5 | 167.5 |
| 6 | KKR | 2024-03-25 | 140 | 140.0 | 140.0 |
| 9 | KKR | 2024-03-26 | 158 | 149.0 | 149.0 |
| 0 | MI | 2024-03-22 | 180 | 180.0 | 180.0 |
| 2 | MI | 2024-03-23 | 150 | 165.0 | 165.0 |
| 4 | RCB | 2024-03-24 | 175 | 175.0 | 175.0 |
| 7 | RCB | 2024-03-25 | 155 | 165.0 | 165.0 |
| 3 | SRH | 2024-03-23 | 160 | 160.0 | 160.0 |
| 8 | SRH | 2024-03-26 | 162 | 161.0 | 161.0 |
In [11]:
# Cell 10: Handling missing data
df_missing = df.copy()
df_missing.loc[2,'runs_scored'] = np.nan
df_missing.loc[4,'attendance'] = np.nan
df_missing
# Fill numeric NAs with team-wise mean:
df_missing['runs_scored_filled'] = df_missing.groupby('team')['runs_scored'].transform(lambda x: x.fillna(x.mean()))
df_missing
Out[11]:
| match_id | date | team | opponent | venue | runs_scored | runs_against | result | player_of_match | attendance | runs_scored_filled | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 2024-03-22 | MI | CSK | Mumbai | 180.0 | 165 | win | Rohit | 30000.0 | 180.0 |
| 1 | 1 | 2024-03-22 | CSK | MI | Mumbai | 165.0 | 180 | loss | Raina | 30000.0 | 165.0 |
| 2 | 2 | 2024-03-23 | MI | SRH | Hyderabad | NaN | 160 | loss | Jadeja | 25000.0 | 180.0 |
| 3 | 2 | 2024-03-23 | SRH | MI | Hyderabad | 160.0 | 150 | win | Kohli | 25000.0 | 160.0 |
| 4 | 3 | 2024-03-24 | RCB | CSK | Bengaluru | 175.0 | 170 | win | Warner | NaN | 175.0 |
| 5 | 3 | 2024-03-24 | CSK | RCB | Bengaluru | 170.0 | 175 | loss | Du Plessis | 35000.0 | 170.0 |
| 6 | 4 | 2024-03-25 | KKR | RCB | Kolkata | 140.0 | 155 | loss | Russell | 40000.0 | 140.0 |
| 7 | 4 | 2024-03-25 | RCB | KKR | Kolkata | 155.0 | 140 | win | ABD | 40000.0 | 155.0 |
| 8 | 5 | 2024-03-26 | SRH | KKR | Hyderabad | 162.0 | 158 | win | Bhuvi | 22000.0 | 162.0 |
| 9 | 5 | 2024-03-26 | KKR | SRH | Kolkata | 158.0 | 162 | loss | Narine | 40000.0 | 158.0 |
In [12]:
# Cell 11: Categorical data
df['result'] = df['result'].astype('category')
df['result'] = df['result'].cat.reorder_categories(['win','loss'], ordered=True)
df.dtypes
Out[12]:
match_id int64 date datetime64[ns] team object opponent object venue object runs_scored int64 runs_against int64 result category player_of_match object attendance int64 dtype: object
In [13]:
# Cell 12: Time-series analysis
ts = df.set_index('date').sort_index()
weekly = ts['attendance'].resample('W').sum()
attendance_7d = ts['attendance'].rolling('7D').sum()
weekly, attendance_7d
Out[13]:
(date 2024-03-24 180000 2024-03-31 142000 Freq: W-SUN, Name: attendance, dtype: int64, date 2024-03-22 30000.0 2024-03-22 60000.0 2024-03-23 85000.0 2024-03-23 110000.0 2024-03-24 145000.0 2024-03-24 180000.0 2024-03-25 220000.0 2024-03-25 260000.0 2024-03-26 282000.0 2024-03-26 322000.0 Name: attendance, dtype: float64)
In [ ]: