# Pyjanitor

取消下面代码的注释并执行，即可安装 Pyjanitor

In [1]:
#!pip install pyjanitor

In [2]:
import janitor

# Or use the `as` keyword to set an alias.
import janitor as jn

In [3]:
import janitor as jn
import pandas as pd

# One tidy column name and one polluted with special characters.
data = pd.DataFrame({"a": [1, 3, 3], "b$231@!#_": ["a", "b", "c"]})
data.head()
# Functional form: hand the janitor function to DataFrame.pipe ...
data.pipe(jn.clean_names, remove_special=True)
# ... or call it as a DataFrame method; both are invoked with the same options.
data.clean_names(remove_special=True)

Unnamed: 0,a,b231_
0,1,a
1,3,b
2,3,c


## coalesce

In [4]:
# native pandas code

import pandas as pd

# Every row holds at most one non-missing value across "a" and "b".
data = pd.DataFrame(dict(a=[1, pd.NA, 3, pd.NA], b=[pd.NA, 1.3, pd.NA, pd.NA]))
data.head()

Unnamed: 0,a,b
0,1.0,
1,,1.3
2,3.0,
3,,


In [5]:
def use_or_not(row):
    """Coalesce columns "a" and "b" of one row.

    Args:
        row: A mapping-like object (e.g. a DataFrame row) indexable by
            "a" and "b".

    Returns:
        The value of "a" when it is not missing, otherwise the value of
        "b" when it is not missing, otherwise ``pd.NA``.
    """
    # First non-missing value wins, so a row where both columns are filled
    # keeps its "a" value instead of being turned into NA.
    for column in ("a", "b"):
        value = row[column]
        if not pd.isna(value):
            return value
    return pd.NA

In [6]:
# Run the row-wise helper over every row and store the result as column "c".
data["c"] = data.apply(use_or_not, axis="columns")
data.head()

Unnamed: 0,a,b,c
0,1.0,,1.0
1,,1.3,1.3
2,3.0,,3.0
3,,,


In [7]:
# janitor code
import janitor
import pandas as pd

data = pd.DataFrame(
    dict(
        a=[1, pd.NA, 3, pd.NA],
        b=[pd.NA, 1.3, pd.NA, pd.NA],
        c=[3, pd.NA, 2, 3.1],
    )
)
# "c" already exists in the frame and is also named as the target column.
data.coalesce(
    "a",
    "b",
    target_column_name="c",
)

Unnamed: 0,a,b,c
0,1.0,,1.0
1,,1.3,1.3
2,3.0,,3.0
3,,,


In [8]:
# DataFrame.bfill
import pandas as pd

# Same "a"/"b" columns as before plus a third column "c" to fall back on.
data = pd.DataFrame(
    dict(
        a=[1, pd.NA, 3, pd.NA],
        b=[pd.NA, 1.3, pd.NA, pd.NA],
        c=[3, pd.NA, 2, 3.1],
    )
)
data.head()

Unnamed: 0,a,b,c
0,1.0,,3.0
1,,1.3,
2,3.0,,2.0
3,,,3.1


In [9]:
# Back-fill across columns: a missing "a" takes the value from "b".
data.filter(items=["a", "b"]).bfill(axis="columns")

Unnamed: 0,a,b
0,1.0,
1,1.3,1.3
2,3.0,
3,,


In [10]:
# Same back-fill, now with "c" as an additional column to fill from.
data.filter(items=["a", "b", "c"]).bfill(axis="columns")

Unnamed: 0,a,b,c
0,1.0,3.0,3.0
1,1.3,1.3,
2,3.0,2.0,2.0
3,3.1,3.1,3.1


In [11]:
# After back-filling across columns, keep only the first column.
data.filter(items=["a", "b", "c"]).bfill(axis="columns").iloc[:, 0]

0    1.0
1    1.3
2    3.0
3    3.1
Name: a, dtype: float64

## case_when

In [12]:
# native pandas code
import pandas as pd

# Mixed-type columns: four integer rows followed by one row of strings.
df = pd.DataFrame(
    dict(
        a=[0, 0, 1, 2, "hi"],
        b=[0, 3, 4, 5, "bye"],
        c=[6, 7, 8, 9, "wait"],
    )
)

In [13]:
def case_when(row):
    """Pick a value for one row using ordered, first-match-wins rules.

    Returns ``row["a"]`` when ("a" is 0 and "b" is not 0) or "c" equals
    "wait"; otherwise "x" when both "a" and "b" are 0; otherwise ``row["c"]``.
    """
    a_is_zero = row["a"] == 0
    # Rule 1, first alternative: "a" is zero while "b" is not.
    if a_is_zero and row["b"] != 0:
        return row["a"]
    # Rule 1, second alternative: "c" carries the "wait" marker.
    if row["c"] == "wait":
        return row["a"]
    # Rule 2: both "a" and "b" are zero.
    if a_is_zero and row["b"] == 0:
        return "x"
    # Fallback when no rule matched.
    return row["c"]

In [14]:
# Evaluate the row-wise rules for every row and attach the result as "new_col".
df.assign(
    new_col=df.apply(case_when, axis="columns"),
)

Unnamed: 0,a,b,c,new_col
0,0,0,6,x
1,0,3,7,0
2,1,4,8,8
3,2,5,9,9
4,hi,bye,wait,hi


In [15]:
# janitor code
import janitor
import pandas as pd

# Same mixed-type frame as in the native pandas example.
df = pd.DataFrame(
    dict(
        a=[0, 0, 1, 2, "hi"],
        b=[0, 3, 4, 5, "bye"],
        c=[6, 7, 8, 9, "wait"],
    )
)
df.head()

Unnamed: 0,a,b,c
0,0,0,6
1,0,3,7
2,1,4,8
3,2,5,9
4,hi,bye,wait


In [16]:
# fmt:off
# Arguments are written as (condition, value) pairs, one pair per line; the
# fmt:off / fmt:on markers presumably stop an auto-formatter from re-wrapping them.
df.case_when(
     # Same test as the first branch of the plain-Python `case_when` function.
     ((df.a == 0) & (df.b != 0)) | (df.c == "wait"), df.a,
     # Same test as its second branch.
     (df.b == 0) & (df.a == 0), "x",
     # Trailing unpaired argument: presumably the fallback when no condition
     # matches (rows 2 and 3 of the output take their "c" value) - TODO confirm
     # against the installed pyjanitor version.
     df.c,
     column_name="new_col",
)
# fmt:on

Unnamed: 0,a,b,c,new_col
0,0,0,6,x
1,0,3,7,0
2,1,4,8,8
3,2,5,9,9
4,hi,bye,wait,hi


## concatenate_columns & deconcatenate_column

In [17]:
# native pandas code
import pandas as pd

# A numeric id column plus two string columns to be joined together.
data = pd.DataFrame(
    {"no.": [1, 2, 3], "prefix": ["auto", "de", "em"], "base": ["matic", "code", "body"]}
)

data.head()

Unnamed: 0,no.,prefix,base
0,1,auto,matic
1,2,de,code
2,3,em,body


In [18]:
# Join "prefix" and "base" element-wise with "-" between them.
data["prefix"].str.cat(others=data["base"], sep="-")

0    auto-matic
1       de-code
2       em-body
Name: prefix, dtype: object

In [19]:
# Alternative spelling: plain `+` on the two columns with a "-" literal in between.
data["prefix"] + "-" + data["base"]

0    auto-matic
1       de-code
2       em-body
dtype: object

In [20]:
# "no." is numeric, so cast it to str first; then join all three columns
# in a single str.cat call by passing the remaining columns as a list.
data["no."].astype(str).str.cat([data["prefix"], data["base"]], sep="-")

0    1-auto-matic
1       2-de-code
2       3-em-body
Name: no., dtype: object

In [21]:
# janitor code
import janitor
import pandas as pd

data = pd.DataFrame(
    {"no.": [1, 2, 3], "prefix": ["auto", "de", "em"], "base": ["matic", "code", "body"]}
)
# Join the three columns into a new column named "word".
data.concatenate_columns(column_names=["no.", "prefix", "base"], new_column_name="word")

Unnamed: 0,no.,prefix,base,word
0,1,auto,matic,1-auto-matic
1,2,de,code,2-de-code
2,3,em,body,3-em-body


In [22]:
import janitor
import pandas as pd

# Weekly dates for January 2022, rendered as strings, split on "-" into
# year / month / day columns, then year and month glued back together.
(
    pd.DataFrame(dict(date=pd.date_range("20220101", "20220201", freq="1W")))
    .astype(str)
    .deconcatenate_column(
        "date",
        new_column_names=["year", "month", "day"],
        sep="-",
    )
    .assign(
        year_month=lambda frame: frame["year"].str.cat(frame["month"], sep=""),
    )
)

Unnamed: 0,date,year,month,day,year_month
0,2022-01-02,2022,1,2,202201
1,2022-01-09,2022,1,9,202201
2,2022-01-16,2022,1,16,202201
3,2022-01-23,2022,1,23,202201
4,2022-01-30,2022,1,30,202201


## join_apply

In [23]:
# native pandas code
import pandas as pd

# Two small integer columns used by the row-wise examples that follow.
data = pd.DataFrame(dict(a=[1, 3, 5], b=[2, 4, 6]))
data.head()

Unnamed: 0,a,b
0,1,2
1,3,4
2,5,6


In [24]:
# Row-wise computation of (2 * a + b) / 3, attached as column "c".
data.assign(
    c=lambda frame: frame.apply(
        lambda r: (r["a"] * 2 + r["b"]) / 3,
        axis="columns",
    )
)

Unnamed: 0,a,b,c
0,1,2,1.333333
1,3,4,3.333333
2,5,6,5.333333


In [25]:
# Equivalent to the row-wise version, written as whole-column arithmetic:
data.assign(c=(data["a"] * 2 + data["b"]) / 3)

Unnamed: 0,a,b,c
0,1,2,1.333333
1,3,4,3.333333
2,5,6,5.333333


In [26]:
# janitor code
import janitor
import pandas as pd

data = pd.DataFrame(dict(a=[1, 3, 5], b=[2, 4, 6]))

# Compute (2 * a + b) / 3 for each row and add it as column "c".
data.join_apply(
    lambda r: (r["a"] * 2 + r["b"]) / 3,
    new_column_name="c",
)

Unnamed: 0,a,b,c
0,1,2,1.333333
1,3,4,3.333333
2,5,6,5.333333
