Python Pandas Cheatsheet
Remove rows containing specific string from column
# Remove rows containing string 'meow'
df = df[df['row_name'].str.contains('meow') == False]
Sum of multiple columns
# Create a new column called "added" and containing sum of columns 'a' and 'b'
df['added_col'] = df['a'] + df['b']
Update all row values based on condition
# For all rows, modify column 'b' values based on condition on column 'a' values
df.loc[df['a'] >= 30, ['b']] = 'updated_value'
Downsample based on multiple conditions
# Get only the rows that fall within the specified dates
df = df[(df['DATE'] >= '2022-07-01') & (df['DATE'] < '2022-10-01')]
Perform group by
# Group by name, where sum the price and quantity, and take the first value for date
agg_func = { 'PRICE': 'sum', 'QUANTITY': 'sum', 'DATE': 'first' }
df_grouped = df.groupby(df['NAME']).aggregate(agg_func)
Multiplication between multiple columns
# Multiply values in column 'A' with column 'B' and put them in column 'C'
df['C'] = df['A'].astype(int) * df['B'].astype(int)
Get Month from date
# Create a new column 'MONTH' and put the first day of the month as date from another column containing dates
df['Month'] = pd.to_datetime(df['Date']).dt.to_period('M').dt.to_timestamp()
Progress bar for writing out CSV file
# Save a csv file for the given dataframe 'df' and output path 'loc'
# Same output as: df.to_csv(loc, index=False)
def to_csv_with_progress_bar(df, loc):
print('Saving to {} ...'.format(loc))
chunks = np.array_split(df.index, 100)
for chunk, subset in enumerate(tqdm(chunks)):
if chunk == 0:
df.loc[subset].to_csv(loc, mode='w', index=False)
else:
df.loc[subset].to_csv(loc, mode='a', index=False, header=False)