Learn Pandas in Detail

What is Pandas? Why do we need to use it?
- Pandas is a Python library used for working with data sets.
- Pandas allows us to analyze big data and draw conclusions based on statistical theories.
- Pandas can clean messy data sets and make them readable and relevant.
- Relevant data is very important in data science.


# Install pandas before running this script. This is a shell command, not
# Python, so run it in a notebook cell or in your system's terminal:
#
#   pip install pandas

# The import has to come first: `pd` does not exist until this line has run.
import pandas as pd

# Confirm which pandas version is installed.
print(pd.__version__)

import pandas as pd
# A plain Python list: its items can only be addressed by integer position.
a = [1, 7, 2]

# Wrap the list in a Series and attach a label to each value.
myVar = pd.Series(a, index=["a", "b", "c"])

print(a[0])        # list: access by position
print(myVar["a"])  # Series: access by label

# Access by integer position. On a Series whose labels are not integers,
# `myVar[1]` depends on a deprecated positional fallback, so use `.iloc`,
# which is always positional.
print(myVar.iloc[1])

# Build a Series from a list, labelling the values "x", "y" and "z".
a = [1, 7, 2]
myVar = pd.Series(data=a, index=list("xyz"))

# Show the whole Series, then look up a single value by its label.
print(myVar)
print(myVar.loc["y"])

# A dict turns into a Series whose labels are the dict keys.
calories = dict(day1=420, day2=380, day3=390)
myVar = pd.Series(data=calories)
print(myVar)

# The same key works for both the dict and the Series built from it.
print(calories.get("day2"))
print(myVar.loc["day2"])


# Passing `index` alongside a dict keeps only the listed keys, in that order.
calories = dict(day1=420, day2=380, day3=390)
myVar = pd.Series(data=calories, index=["day1", "day3"])
print(myVar)


import pandas as pd

# Column-oriented input: every key becomes a column of the DataFrame.
data = dict(
    Name=["Alice", "Bob", "Charlie", "David"],
    Age=[24, 27, 22, 32],
    Salary=[50000, 60000, 55000, 70000],
)

df = pd.DataFrame(data=data)
print(df)

# Selecting one column gives back a Series.
print(df["Salary"])
print(type(df["Salary"]))

# First salary, looked up by row label 0 and column name.
print(df.at[0, "Salary"])

# Running total of the salaries.
print(df["Salary"].cumsum())

# Boolean mask: keep only the salaries strictly above 60000.
series1 = df["Salary"]
print(series1.loc[series1.gt(60000)])

import pandas as pd

# Rebuild the sample table; rows get the default labels 0..3.
data = dict(
    Name=["Alice", "Bob", "Charlie", "David"],
    Age=[24, 27, 22, 32],
    Salary=[50000, 60000, 55000, 70000],
)

df = pd.DataFrame(data=data)
print(df)

# `loc` selects a row by its label; here the label is the integer 0.
print(df.loc[0])

data = dict(
    Name=["Alice", "Bob", "Charlie", "David"],
    Age=[24, 27, 22, 32],
    Salary=[50000, 60000, 55000, 70000],
)

# This time the rows carry the string labels "a".."d" instead of 0..3.
df = pd.DataFrame(data=data, index=list("abcd"))
print(df)

# Label-based access: a whole row, then a single cell.
print(df.loc["a"])

print(df.at["a", "Salary"])

# Boolean filter: rows whose Age is greater than 25.
print(df.loc[df["Age"] > 25])

# Position-based access: the second row, then row 0 / column 2.
print(df.iloc[1])

print(df.iat[0, 2])

data = dict(
    Name=["Alice", "Bob", "Charlie", "David"],
    Age=[24, 27, 22, 32],
    Salary=[50000, 60000, 55000, 70000],
)

df = pd.DataFrame(data=data, index=list("abcd"))
print(df)

# Assigning to a label that does not exist yet appends a new row.
df.loc["e"] = ["Charlie", 35, 50000]
print(df)

# Assigning to a new column name adds a column derived from "Age".
df["Age after 10 years"] = df["Age"].add(10)
print(df)

# Folder that holds the CSV file; replace with the path on your machine.
path = "../Data/"

# Full location of the data set: the folder followed by the file name.
file = f"{path}Customer Purchase.csv"

import pandas as pd
# Load the CSV file into a DataFrame.
df = pd.read_csv(file)

# Preview the first rows.
print(df.head())

# Count the missing values in two individual columns ...
print(df["Age"].isna().sum())
print(df["Education"].isna().sum())

# ... then per column for the whole frame ...
print(df.isna().sum())

# ... and finally as a single grand total.
print(df.isna().sum().sum())

print(df.head(20))

# Keep only the rows that contain at least one missing value.
rows_with_missing = df.loc[df.isna().any(axis="columns")]
print(rows_with_missing)

# Alternative ways to handle missing values (kept disabled):
#
# Drop rows that contain any missing value / rows where every value is missing.
# df_dropped = df.dropna()
# df_dropped = df.dropna(how='all')
#
# Same idea for columns (axis=1).
# df_dropped_columns = df.dropna(axis=1)
# df_dropped_columns
# df_dropped_columns = df.dropna(axis=1, how='all')

# Replace every missing value with 0; `df` itself is left unchanged.
df_filled = df.fillna(value=0)
print(df_filled.head(n=20))

# Work on an independent copy. A plain `df2 = df` only gives the same
# DataFrame a second name, so the fills below would also overwrite `df`.
df2 = df.copy()

# Fill the gaps in three columns with the mean, median and mode.
# NOTE(review): all three fill values are computed from the "Age" column, not
# from the column being filled. Confirm that this is intended for this data set.
df2['Review'] = df2['Review'].fillna(df['Age'].mean())  # mean: the average
df2['Education'] = df2['Education'].fillna(df['Age'].median())  # median: the middle number
df2['Purchased'] = df2['Purchased'].fillna(df['Age'].mode()[0])  # mode: the most frequent number
print(df2)

# Save to CSV (index=False leaves the row labels out of the file).
df2.to_csv(path + 'output.csv', index=False)

# Save to Excel. pandas needs an Excel writer engine such as openpyxl
# installed for this call.
df2.to_excel(path + 'output.xlsx', index=False)


2.SampleNotebook

Leave a Reply

Your email address will not be published. Required fields are marked *