# 什么是 DataFrame？

In [112]:
import pandas as pd

# Column-oriented input: every key becomes a column, every list its values.
data = {
    "name": ["Alice", "Bob", "Carol"],
    "age": [10, 20, 30],
}
df = pd.DataFrame(data)
df

Unnamed: 0,name,age
0,Alice,10
1,Bob,20
2,Carol,30


In [113]:
# Prints a summary of the frame: index range, per-column non-null counts,
# dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   name    3 non-null      object
 1   age     3 non-null      int64 
dtypes: int64(1), object(1)
memory usage: 176.0+ bytes


In [114]:
df["name"]  # indexing with a single column label returns that column as a Series

0    Alice
1      Bob
2    Carol
Name: name, dtype: object

In [115]:
# Confirms that a single column of a DataFrame is a pandas Series.
type(df["name"])

pandas.core.series.Series

# 常用的 DataFrame 属性

In [116]:
# Row labels; a RangeIndex here because the frame was built without an explicit index.
df.index

RangeIndex(start=0, stop=3, step=1)

In [117]:
# Data type of each column, returned as a Series keyed by column name.
df.dtypes

name    object
age      int64
dtype: object

In [118]:
# (number of rows, number of columns)
df.shape

(3, 2)

In [119]:
# Total number of cells: 3 rows x 2 columns = 6.
df.size

6

In [120]:
# The data as a 2-D NumPy array; dtype is object because the columns mix strings and ints.
df.values

array([['Alice', 10],
       ['Bob', 20],
       ['Carol', 30]], dtype=object)

In [121]:
# Column labels, returned as an Index.
df.columns

Index(['name', 'age'], dtype='object')

In [122]:
# Both axes in one list: [row index, column index].
df.axes

[RangeIndex(start=0, stop=3, step=1), Index(['name', 'age'], dtype='object')]

In [123]:
# Number of dimensions; 2 for this DataFrame (rows and columns).
df.ndim

2

In [124]:
# Report whether the frame holds any data, based on its `empty` attribute.
print("df is empty" if df.empty else "df is not empty")

df is not empty


# 常用的 DataFrame 方法

## 查看数据信息

In [125]:
# info() prints its summary directly (index range, non-null counts, dtypes,
# memory usage) instead of returning a value to display.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   name    3 non-null      object
 1   age     3 non-null      int64 
dtypes: int64(1), object(1)
memory usage: 176.0+ bytes


示例：构造一个 100 行的随机 DataFrame，用来演示 `head()` 和 `tail()`：

In [126]:
import random

import pandas as pd

# Fixed seed so the random values below are reproducible.
random.seed(233)

# 100 rows x 3 columns of random integers in [1, 100). The columns are
# filled one after another in the order a, b, c.
data = pd.DataFrame(
    {name: [random.randrange(1, 100) for _ in range(100)] for name in ("a", "b", "c")}
)
data

Unnamed: 0,a,b,c
0,88,69,31
1,81,87,40
2,23,18,15
3,68,52,89
4,30,90,73
...,...,...,...
95,95,4,8
96,93,14,1
97,18,91,63
98,27,31,62


In [127]:
# First rows of the frame; with no argument, five rows are shown.
data.head()

Unnamed: 0,a,b,c
0,88,69,31
1,81,87,40
2,23,18,15
3,68,52,89
4,30,90,73


In [128]:
# Last rows of the frame; with no argument, five rows are shown.
data.tail()

Unnamed: 0,a,b,c
95,95,4,8
96,93,14,1
97,18,91,63
98,27,31,62
99,63,73,85


## 索引数据

### 列表式

In [129]:
# Indexing with a list of column names selects those columns as a DataFrame.
cols = ["a", "b"]

filtered = data[cols]
filtered.head()

Unnamed: 0,a,b
0,88,69
1,81,87
2,23,18
3,68,52
4,30,90


### 坐标式

In [130]:
import random
import string

# Re-seed so the labels and values generated in this cell are reproducible.
random.seed(233)


def make_labels(n, length=5):
    """Generate ``n`` random alphanumeric labels.

    Args:
        n: Number of labels to generate.
        length: Number of characters per label. Defaults to 5, the value
            that used to be hard-coded.

    Returns:
        A list of ``n`` strings. Characters are taken from ASCII letters and
        digits with ``random.sample``, so no character repeats inside one
        label. Different labels are not guaranteed to be distinct.

    Raises:
        ValueError: Propagated from ``random.sample`` when ``length`` is
            negative or larger than the character pool.
    """
    pool = string.ascii_letters + string.digits
    return ["".join(random.sample(pool, length)) for _ in range(n)]


# Draw the row labels first and the column values afterwards, so the order
# of random calls is: labels, then columns a, b, c.
labels = make_labels(100)

data = pd.DataFrame(
    {name: [random.randrange(1, 100) for _ in range(100)] for name in ("a", "b", "c")},
    index=labels,
)
data.head()

Unnamed: 0,a,b,c
RZOlH,39,76,77
oZ2EJ,95,41,5
nf1gr,12,75,86
oSTae,4,69,61
7RpMT,51,75,79


In [131]:
# `at` reads a single value addressed by row label and column label.
data.at["oZ2EJ", "b"]

41

In [132]:
# `iat` reads a single value addressed by integer position: third row, second column.
data.iat[2, 1]

75

In [133]:
# `iat` accepts integer positions only; passing a row label raises
# ValueError ("iAt based indexing can only have integer indexers"):
# data.iat["oZ2EJ", 1]

### 列表坐标式

In [134]:
# `loc` slices by label, and both ends of a label slice are included.
data.loc["nf1gr":"oSTae", "a":"b"]

Unnamed: 0,a,b
nf1gr,12,75
oSTae,4,69


In [135]:
# `iloc` selects by integer position; the slice stop is excluded, so 2:3 is only the third row.
data.iloc[2:3, [1, 2]]

Unnamed: 0,b,c
nf1gr,75,86


In [136]:
# Open-ended label slices: everything from the start up to and including "7RpMT" / "b".
data.loc[:"7RpMT", :"b"]

Unnamed: 0,a,b
RZOlH,39,76
oZ2EJ,95,41
nf1gr,12,75
oSTae,4,69
7RpMT,51,75


In [137]:
# Every row except the last; 0:1 keeps only the first column but still returns a DataFrame.
data.iloc[:-1, 0:1]

Unnamed: 0,a
RZOlH,39
oZ2EJ,95
nf1gr,12
oSTae,4
7RpMT,51
...,...
E14KY,77
KDjSI,79
tn75V,85
4ITDH,55


In [138]:
# Rows by label slice (inclusive), columns by an explicit list of labels.
data.loc["nf1gr":"7RpMT", ["a", "c"]]

Unnamed: 0,a,c
nf1gr,12,86
oSTae,4,61
7RpMT,51,79


In [139]:
# Lists of integer positions for both axes: rows 0 and 1, columns 0 and 2.
data.iloc[[0, 1], [0, 2]]

Unnamed: 0,a,c
RZOlH,39,77
oZ2EJ,95,5



## 拼接数据

### append（pandas 2.0 起已移除 `DataFrame.append`，请使用 `pd.concat`）

In [140]:
# First frame to combine: columns "a" and "b" with the default integer index.
data1 = pd.DataFrame(dict(a=["A", "B"], b=[1, 2]))
data1.head()

Unnamed: 0,a,b
0,A,1
1,B,2


In [141]:
# Second frame to combine: same columns as data1, different values.
data2 = pd.DataFrame(dict(a=["C", "D"], b=[3, 4]))
data2.head()

Unnamed: 0,a,b
0,C,3
1,D,4


In [142]:
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0.
# pd.concat stacks the rows the same way and keeps each frame's own index.
pd.concat([data1, data2])

Unnamed: 0,a,b
0,A,1
1,B,2
0,C,3
1,D,4


In [143]:
# data3 has a column "c" that data1/data2 lack, and no column "b".
data3 = pd.DataFrame(
    {
        "a": ["E", "F"],
        "c": [5, 6],
    }
)

# DataFrame.append was removed in pandas 2.0; a single pd.concat call stacks
# all three frames. Columns missing from a frame are filled with NaN, which
# is why "b" and "c" come out as floats.
pd.concat([data1, data2, data3])

Unnamed: 0,a,b,c
0,A,1.0,
1,B,2.0,
0,C,3.0,
1,D,4.0,
0,E,,5.0
1,F,,6.0


### merge

In [144]:

# Two frames sharing the key column "a" and an overlapping column "b".
data1 = pd.DataFrame(dict(a=[1, 2], b=["a", "b"]))
data2 = pd.DataFrame(dict(a=[1, 2], b=["c", "d"]))

# Join on "a"; the clashing "b" columns come out as b_x (left) and b_y (right).
data1.merge(data2, on="a")

Unnamed: 0,a,b_x,b_y
0,1,a,c
1,2,b,d
