Python Pandas Cheatsheet

Remove rows containing specific string from column

# Remove rows containing string 'meow'
df = df[df['row_name'].str.contains('meow') == False]

Sum of multiple columns

# Create a new column called "added" and containing sum of columns 'a' and 'b'
df['added_col'] = df['a'] + df['b']

Update all row values based on condition

# For all rows, modify column 'b' values based on condition on column 'a' values
df.loc[df['a'] >= 30, ['b']] = 'updated_value'

Downsample based on multiple conditions

# Get only the rows that fall within the specified dates
df = df[(df['DATE'] >= '2022-07-01') & (df['DATE'] < '2022-10-01')]

Perform group by

# Group by name, where sum the price and quantity, and take the first value for date
agg_func = { 'PRICE': 'sum', 'QUANTITY': 'sum', 'DATE': 'first' }
df_grouped = df.groupby(df['NAME']).aggregate(agg_func)

Multiplication between multiple columns

# Multiply values in column 'A' with column 'B' and put them in column 'C'
df['C'] = df['A'].astype(int) * df['B'].astype(int)

Get Month from date

# Create a new column 'MONTH' and put the first day of the month as date from another column containing dates
df['Month'] = pd.to_datetime(df['Date']).dt.to_period('M').dt.to_timestamp()

Progress bar for writing out CSV file

# Save a csv file for the given dataframe 'df' and output path 'loc'
# Same output as: df.to_csv(loc, index=False)
def to_csv_with_progress_bar(df, loc):
    print('Saving to {} ...'.format(loc))
    chunks = np.array_split(df.index, 100)
    for chunk, subset in enumerate(tqdm(chunks)):
        if chunk == 0:
            df.loc[subset].to_csv(loc, mode='w', index=False)
        else:
            df.loc[subset].to_csv(loc, mode='a', index=False, header=False)

Leave a Reply

Your email address will not be published. Required fields are marked *