AdvancedPandas

Advanced Pandas Tutorial¶

In [2]:

# IPL 2024 — Pandas Cookbook

# Cell 1: Imports & options
import pandas as pd
import numpy as np
pd.set_option('display.max_columns', 50)
pd.set_option('display.width', 120)

In [3]:

# Cell 2: Create sample dataset and save CSV
data = {
    'match_id': [1,1,2,2,3,3,4,4,5,5],
    'date': ['2024-03-22','2024-03-22','2024-03-23','2024-03-23','2024-03-24','2024-03-24','2024-03-25','2024-03-25','2024-03-26','2024-03-26'],
    'team': ['MI','CSK','MI','SRH','RCB','CSK','KKR','RCB','SRH','KKR'],
    'opponent': ['CSK','MI','SRH','MI','CSK','RCB','RCB','KKR','KKR','SRH'],
    'venue': ['Mumbai','Mumbai','Hyderabad','Hyderabad','Bengaluru','Bengaluru','Kolkata','Kolkata','Hyderabad','Kolkata'],
    'runs_scored': [180,165,150,160,175,170,140,155,162,158],
    'runs_against': [165,180,160,150,170,175,155,140,158,162],
    'result': ['win','loss','loss','win','win','loss','loss','win','win','loss'],
    'player_of_match': ['Rohit','Raina','Jadeja','Kohli','Warner','Du Plessis','Russell','ABD','Bhuvi','Narine'],
    'attendance': [30000,30000,25000,25000,35000,35000,40000,40000,22000,40000]
}
df = pd.DataFrame(data)
df['date'] = pd.to_datetime(df['date'])
df.to_csv('ipl2024_sample.csv', index=False)
df.head()

Out[3]:

	match_id	date	team	opponent	venue	runs_scored	runs_against	result	player_of_match	attendance
0	1	2024-03-22	MI	CSK	Mumbai	180	165	win	Rohit	30000
1	1	2024-03-22	CSK	MI	Mumbai	165	180	loss	Raina	30000
2	2	2024-03-23	MI	SRH	Hyderabad	150	160	loss	Jadeja	25000
3	2	2024-03-23	SRH	MI	Hyderabad	160	150	win	Kohli	25000
4	3	2024-03-24	RCB	CSK	Bengaluru	175	170	win	Warner	35000

In [4]:

# Cell 3: DataFrame operations
df = pd.read_csv('ipl2024_sample.csv', parse_dates=['date'])
print("Shape:", df.shape)
df.head()
df.describe(include='all')

Shape: (10, 10)

Out[4]:

	match_id	date	team	opponent	venue	runs_scored	runs_against	result	player_of_match	attendance
count	10.000000	10	10	10	10	10.000000	10.000000	10	10	10.000000
unique	NaN	NaN	5	5	4	NaN	NaN	2	10	NaN
top	NaN	NaN	MI	CSK	Hyderabad	NaN	NaN	win	Rohit	NaN
freq	NaN	NaN	2	2	3	NaN	NaN	5	1	NaN
mean	3.000000	2024-03-24 00:00:00	NaN	NaN	NaN	161.500000	161.500000	NaN	NaN	32200.000000
min	1.000000	2024-03-22 00:00:00	NaN	NaN	NaN	140.000000	140.000000	NaN	NaN	22000.000000
25%	2.000000	2024-03-23 00:00:00	NaN	NaN	NaN	155.750000	155.750000	NaN	NaN	26250.000000
50%	3.000000	2024-03-24 00:00:00	NaN	NaN	NaN	161.000000	161.000000	NaN	NaN	32500.000000
75%	4.000000	2024-03-25 00:00:00	NaN	NaN	NaN	168.750000	168.750000	NaN	NaN	38750.000000
max	5.000000	2024-03-26 00:00:00	NaN	NaN	NaN	180.000000	180.000000	NaN	NaN	40000.000000
std	1.490712	NaN	NaN	NaN	NaN	11.834507	11.834507	NaN	NaN	6795.423296

In [5]:

# Cell 4: Indexing (set_index, loc, iloc, boolean)
df_idx = df.set_index(['match_id','team'])
df_idx.head()
# Example: df_idx.loc[(2,'MI')]
# Example: df.iloc[:2]
# Example: df[df['attendance'] > 30000]

Out[5]:

		date	opponent	venue	runs_scored	runs_against	result	player_of_match	attendance
match_id	team
1	MI	2024-03-22	CSK	Mumbai	180	165	win	Rohit	30000
1	CSK	2024-03-22	MI	Mumbai	165	180	loss	Raina	30000
2	MI	2024-03-23	SRH	Hyderabad	150	160	loss	Jadeja	25000
2	SRH	2024-03-23	MI	Hyderabad	160	150	win	Kohli	25000
3	RCB	2024-03-24	CSK	Bengaluru	175	170	win	Warner	35000

In [6]:

# Cell 5: Merging (team info)
team_info = pd.DataFrame({
    'team': ['MI','CSK','SRH','RCB','KKR'],
    'coach': ['Mahela','Stephen','Trevor','Laxman','Brendon'],
    'captain': ['Rohit','Dhoni','Kane','Virat','Shreyas']
})
merged = df.merge(team_info, on='team', how='left')
merged.head()

Out[6]:

	match_id	date	team	opponent	venue	runs_scored	runs_against	result	player_of_match	attendance	coach	captain
0	1	2024-03-22	MI	CSK	Mumbai	180	165	win	Rohit	30000	Mahela	Rohit
1	1	2024-03-22	CSK	MI	Mumbai	165	180	loss	Raina	30000	Stephen	Dhoni
2	2	2024-03-23	MI	SRH	Hyderabad	150	160	loss	Jadeja	25000	Mahela	Rohit
3	2	2024-03-23	SRH	MI	Hyderabad	160	150	win	Kohli	25000	Trevor	Kane
4	3	2024-03-24	RCB	CSK	Bengaluru	175	170	win	Warner	35000	Laxman	Virat

In [7]:

# Cell 6: Grouping and aggregation
grouped = df.groupby('team').agg(
    matches=('match_id','nunique'),
    total_runs_scored=('runs_scored','sum'),
    avg_attendance=('attendance','mean')
).reset_index()
grouped

Out[7]:

	team	matches	total_runs_scored	avg_attendance
0	CSK	2	335	32500.0
1	KKR	2	298	40000.0
2	MI	2	330	27500.0
3	RCB	2	330	37500.0
4	SRH	2	322	23500.0

In [8]:

# Cell 7: MultiIndex / pivot / stack / unstack
pivot = df.pivot_table(index=['team'], columns='result', values='runs_scored', aggfunc='count', fill_value=0)
pivot

Out[8]:

result	loss	win
team
CSK	2	0
KKR	2	0
MI	1	1
RCB	0	2
SRH	0	2

In [9]:

# Cell 8: Pivot table with multiple aggregations
pv = pd.pivot_table(df, index=['team'], columns=['venue'], values=['runs_scored','attendance'],
                    aggfunc={'runs_scored': np.mean, 'attendance': np.sum}, fill_value=0)
pv

/var/folders/43/0syn7psx5w1fdxg82fm99xpc0000gn/T/ipykernel_7000/2644901071.py:2: FutureWarning: The provided callable <function mean at 0x10520b2e0> is currently using SeriesGroupBy.mean. In a future version of pandas, the provided callable will be used directly. To keep current behavior pass the string "mean" instead.
  pv = pd.pivot_table(df, index=['team'], columns=['venue'], values=['runs_scored','attendance'],
/var/folders/43/0syn7psx5w1fdxg82fm99xpc0000gn/T/ipykernel_7000/2644901071.py:2: FutureWarning: The provided callable <function sum at 0x10520a0e0> is currently using SeriesGroupBy.sum. In a future version of pandas, the provided callable will be used directly. To keep current behavior pass the string "sum" instead.
  pv = pd.pivot_table(df, index=['team'], columns=['venue'], values=['runs_scored','attendance'],

Out[9]:

	attendance				runs_scored
venue	Bengaluru	Hyderabad	Kolkata	Mumbai	Bengaluru	Hyderabad	Kolkata	Mumbai
team
CSK	35000	0	0	30000	170.0	0.0	0.0	165.0
KKR	0	0	80000	0	0.0	0.0	149.0	0.0
MI	0	25000	0	30000	0.0	150.0	0.0	180.0
RCB	35000	0	40000	0	175.0	0.0	155.0	0.0
SRH	0	47000	0	0	0.0	161.0	0.0	0.0

In [10]:

# Cell 9: Window functions (rolling, expanding)
df_sorted = df.sort_values(['team','date']).copy()
df_sorted['runs_scored_rolling2'] = df_sorted.groupby('team')['runs_scored'].rolling(window=2, min_periods=1).mean().reset_index(0,drop=True)
df_sorted['runs_scored_expanding_mean'] = df_sorted.groupby('team')['runs_scored'].expanding(min_periods=1).mean().reset_index(0,drop=True)
df_sorted[['team','date','runs_scored','runs_scored_rolling2','runs_scored_expanding_mean']]

Out[10]:

	team	date	runs_scored	runs_scored_rolling2	runs_scored_expanding_mean
1	CSK	2024-03-22	165	165.0	165.0
5	CSK	2024-03-24	170	167.5	167.5
6	KKR	2024-03-25	140	140.0	140.0
9	KKR	2024-03-26	158	149.0	149.0
0	MI	2024-03-22	180	180.0	180.0
2	MI	2024-03-23	150	165.0	165.0
4	RCB	2024-03-24	175	175.0	175.0
7	RCB	2024-03-25	155	165.0	165.0
3	SRH	2024-03-23	160	160.0	160.0
8	SRH	2024-03-26	162	161.0	161.0

In [11]:

# Cell 10: Handling missing data
df_missing = df.copy()
df_missing.loc[2,'runs_scored'] = np.nan
df_missing.loc[4,'attendance'] = np.nan
df_missing
# Fill numeric NAs with team-wise mean:
df_missing['runs_scored_filled'] = df_missing.groupby('team')['runs_scored'].transform(lambda x: x.fillna(x.mean()))
df_missing

Out[11]:

	match_id	date	team	opponent	venue	runs_scored	runs_against	result	player_of_match	attendance	runs_scored_filled
0	1	2024-03-22	MI	CSK	Mumbai	180.0	165	win	Rohit	30000.0	180.0
1	1	2024-03-22	CSK	MI	Mumbai	165.0	180	loss	Raina	30000.0	165.0
2	2	2024-03-23	MI	SRH	Hyderabad	NaN	160	loss	Jadeja	25000.0	180.0
3	2	2024-03-23	SRH	MI	Hyderabad	160.0	150	win	Kohli	25000.0	160.0
4	3	2024-03-24	RCB	CSK	Bengaluru	175.0	170	win	Warner	NaN	175.0
5	3	2024-03-24	CSK	RCB	Bengaluru	170.0	175	loss	Du Plessis	35000.0	170.0
6	4	2024-03-25	KKR	RCB	Kolkata	140.0	155	loss	Russell	40000.0	140.0
7	4	2024-03-25	RCB	KKR	Kolkata	155.0	140	win	ABD	40000.0	155.0
8	5	2024-03-26	SRH	KKR	Hyderabad	162.0	158	win	Bhuvi	22000.0	162.0
9	5	2024-03-26	KKR	SRH	Kolkata	158.0	162	loss	Narine	40000.0	158.0

In [12]:

# Cell 11: Categorical data
df['result'] = df['result'].astype('category')
df['result'] = df['result'].cat.reorder_categories(['win','loss'], ordered=True)
df.dtypes

Out[12]:

match_id                    int64
date               datetime64[ns]
team                       object
opponent                   object
venue                      object
runs_scored                 int64
runs_against                int64
result                   category
player_of_match            object
attendance                  int64
dtype: object

In [13]:

# Cell 12: Time-series analysis
ts = df.set_index('date').sort_index()
weekly = ts['attendance'].resample('W').sum()
attendance_7d = ts['attendance'].rolling('7D').sum()
weekly, attendance_7d

Out[13]:

(date
 2024-03-24    180000
 2024-03-31    142000
 Freq: W-SUN, Name: attendance, dtype: int64,
 date
 2024-03-22     30000.0
 2024-03-22     60000.0
 2024-03-23     85000.0
 2024-03-23    110000.0
 2024-03-24    145000.0
 2024-03-24    180000.0
 2024-03-25    220000.0
 2024-03-25    260000.0
 2024-03-26    282000.0
 2024-03-26    322000.0
 Name: attendance, dtype: float64)

In [ ]:

Advanced Pandas

Advanced Pandas Tutorial¶

Leave a Reply Cancel reply

Quick Links

Resources

Support