Useful pandas tips

2020. 6. 25. 22:05 | 분석 Python/Pandas Tip

728x90
# Demo frame: one row per state label, three columns of standard-normal noise.
state_labels = ['Utah', 'Ohio', 'Texas', 'Oregon']
column_labels = ['b', 'd', 'e']
frame = pd.DataFrame(
    np.random.randn(len(state_labels), len(column_labels)),
    index=state_labels,
    columns=column_labels,
)
frame
def f(x):
    """Summarise *x* by its extremes.

    Returns a two-entry Series labelled ``'min'`` and ``'max'`` holding
    ``x.min()`` and ``x.max()`` respectively.
    """
    lowest, highest = x.min(), x.max()
    return pd.Series([lowest, highest], index=['min', 'max'])
# With the default axis=0, f receives each column in turn, so the result
# keeps the original columns and has the rows 'min' and 'max'.
frame.apply(f)

# axis="columns" (i.e. axis=1) hands f each row instead, giving one result
# row per index label with the columns 'min' and 'max'.
frame.apply(f, axis="columns")


def two_decimals(x):
    """Format a single number with two digits after the decimal point."""
    return '%.2f' % x


# Element-wise formatting. Mapping every column with Series.map avoids
# DataFrame.applymap, which newer pandas releases deprecate, and the helper
# name no longer shadows the builtin ``format``.
frame.apply(lambda column: column.map(two_decimals))

# Work-log records: one row per time entry (issue, minutes/hours spent, user).
report = pd.DataFrame([
        [1, 10, 'John'],
        [1, 20, 'John'],
        [1, 30, 'Tom'],
        [1, 10, 'Bob'],
        [2, 25, 'John'],
        [2, 15, 'Bob']], columns = ['IssueKey','TimeSpent','User'])

# Total time each user logged on each issue; the result is a Series indexed
# by the (IssueKey, User) pair.
time_logged_by_user = report.groupby(['IssueKey', 'User']).TimeSpent.sum()

# Average of those per-user totals within each issue. Series.mean(level=...)
# was removed in pandas 2.0; grouping on the index level is the equivalent
# spelling and works on both old and new pandas.
mean_time_per_issue = time_logged_by_user.groupby(level="IssueKey").mean()
mean_time_per_issue

# Survey-style answers: one column per question, one row per respondent.
data = pd.DataFrame({'Qu1': [1, 3, 4, 3, 4],
                     'Qu2': [2, 3, 1, 2, 3],
                     'Qu3': [1, 5, 2, 5, 4]})

# Count how often each answer value occurs in every column. The rows of the
# result are the distinct answer values; a value that never occurs in a
# column comes back as NaN and is replaced by 0.
# pd.Series.value_counts is used because the top-level pd.value_counts
# function is deprecated in recent pandas releases.
result = data.apply(pd.Series.value_counts).fillna(0)
result

def top3_petal_length(df):
    """Return the first three rows of *df* after sorting by ``petal_length``
    in descending order (i.e. the rows with the longest petals)."""
    longest_first = df.sort_values("petal_length", ascending=False)
    return longest_first[:3]

# Run top3_petal_length on each species group, i.e. keep the three rows with
# the largest petal_length per species.
# NOTE(review): `iris` is not defined in this file - presumably the iris
# dataset loaded elsewhere as a DataFrame; confirm.
iris.groupby(iris.species).apply(top3_petal_length)

 

 

def q3cut(s):
    """Bin the values of *s* into three quantile-based classes.

    The classes are labelled, from lowest to highest bin, with the Korean
    words for small, medium and large.
    """
    size_labels = ["소", "중", "대"]
    return pd.qcut(s, q=3, labels=size_labels)
    
# Work on a copy so the new column is not added to `iris` itself.
# NOTE(review): `iris` is not defined in this file - presumably loaded
# elsewhere as a DataFrame; confirm.
iris2 = iris.copy()
# Within each species, pass petal_length through q3cut (three quantile
# classes) and store the per-row class label in a new column.
iris2["petal_length_class"] = iris.groupby(iris.species)["petal_length"].transform(q3cut)
# Display the last ten value/class pairs, then the whole frame.
iris2[["petal_length", "petal_length_class"]].tail(10)
iris2

How to find all the local maxima (or peaks) in a numeric series?

ser = pd.Series([2, 10, 3, 4, 9, 10, 2, 7, 3])

# Solution
# Sign of each step: +1 where the series rises, -1 where it falls.
slope_signs = np.sign(np.diff(ser))
# A rise followed directly by a fall shows up as a change of -2.
dd = np.diff(slope_signs)
print(dd)
# dd[k] describes the turn at position k + 1 of the series, hence the shift.
peak_locs = np.where(dd == -2)[0] + 1
peak_locs

 

# Load the Cars93 sample CSV directly from GitHub (requires network access).
df = pd.read_csv('https://raw.githubusercontent.com/selva86/datasets/master/Cars93_miss.csv')

# Number of rows and columns
print(df.shape)

# Data type of every column
print(df.dtypes)

# How many columns there are of each dtype. DataFrame.get_dtype_counts() was
# removed in pandas 1.0; counting the entries of the dtypes Series gives the
# same tally on every pandas version.
print(df.dtypes.value_counts())

 

 

# Input
df = pd.DataFrame(np.random.random(4), columns=['random'])

# Solution
out = df.style.format({
    'random': '{0:.2%}'.format,
})

out

Cap outliers: replace values below/above the given lower/upper quantiles with those quantile values

# Input: 30 points spaced evenly on a log scale from 10**-2 to 10**2.
ser = pd.Series(np.logspace(start=-2, stop=2, num=30))

# Solution
def cap_outliers(ser, low_perc, high_perc):
    """Clip *ser* to the range spanned by two of its own quantiles.

    Args:
        ser: Numeric pandas Series to cap.
        low_perc: Lower quantile as a fraction between 0 and 1 (e.g. 0.05).
        high_perc: Upper quantile as a fraction between 0 and 1 (e.g. 0.95).

    Returns:
        A new Series in which every value below the lower quantile is
        replaced by that quantile and every value above the upper quantile
        is replaced by the upper quantile. The input Series is not modified.
    """
    low, high = ser.quantile([low_perc, high_perc])
    # The arguments are fractions, so scale them before labelling as %ile.
    print(f'{low_perc * 100:g} %ile: {low} | {high_perc * 100:g} %ile: {high}')
    # clip() returns a capped copy instead of overwriting the caller's data.
    return ser.clip(lower=low, upper=high)

# Cap the series at its 5% (lower) and 95% (upper) quantiles.
capped_ser = cap_outliers(ser, .05, .95)

One-hot encode a column (keeping the remaining columns in order)

# Input: a 5x5 frame holding 0..24, columns named 'a' through 'e'.
df = pd.DataFrame(np.arange(25).reshape(5,-1), columns=list('abcde'))

# Solution: expand column 'a' into indicator columns prefixed with "a",
# then place the untouched columns 'b'..'e' to their right.
a_indicators = pd.get_dummies(df['a'], prefix="a")
untouched = df[['b', 'c', 'd', 'e']]
df_onehot = pd.concat([a_indicators, untouched], axis=1)
print(df_onehot)

 

# Start from an all-zero indicator table: one row per movie, one column per
# genre name.
zero_matrix = np.zeros(shape=(len(movies), len(genres)))
dummies = pd.DataFrame(data=zero_matrix, columns=genres)
dummies.head()

# Each movie's genres field is a '|'-separated string; look up the column
# positions of its genres and set those cells of the movie's row to 1.
for row_pos, genre_field in enumerate(movies.genres):
    genre_cols = dummies.columns.get_indexer(genre_field.split('|'))
    dummies.iloc[row_pos, genre_cols] = 1

 

# E-mail pattern with three capture groups: the part before '@', the domain
# name, and a 2-4 letter suffix after the last dot.
pattern = r'([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\.([A-Z]{2,4})'
# IGNORECASE lets the upper-case character classes match lower-case input.
regex = re.compile(pattern, flags=re.IGNORECASE)
# match() only succeeds if the pattern matches at the start of the string.
m = regex.match('wesm@bright.net')
# NOTE(review): `text` is not defined in this file - presumably a string
# containing e-mail addresses; confirm.
regex.findall(text)

 

 

https://www.machinelearningplus.com/python/101-pandas-exercises-python/

 

101 Pandas Exercises for Data Analysis - Machine Learning Plus

101 python pandas exercises are designed to challenge your logical muscle and to help internalize data manipulation with python’s favorite package for data analysis....

www.machinelearningplus.com

 

728x90