pandasでEDA(1)#
pandasのcheatsheetベースで主要なmethodをおさらいしていく
import pandas as pd
import numpy as np
pandasでDataFrameを作成する#
DictからDataFrameを作成する場合#
some_dict = {
"column_1" : [4, 5, 6],
"column_2" : [7, 8, 9],
"column_3" : [10, 11, 12],
}
df = pd.DataFrame(some_dict)
df
| | column_1 | column_2 | column_3 |
---|---|---|---|
0 | 4 | 7 | 10 |
1 | 5 | 8 | 11 |
2 | 6 | 9 | 12 |
### あらかじめ存在するデータ配列をDataFrame化する場合
loss = np.random.rand(10)
acc = np.random.rand(10)
df = pd.DataFrame({"loss": loss, "acc": acc})
df
| | loss | acc |
---|---|---|
0 | 0.909316 | 0.248571 |
1 | 0.497057 | 0.347929 |
2 | 0.650238 | 0.709900 |
3 | 0.238004 | 0.489734 |
4 | 0.258707 | 0.999128 |
5 | 0.568453 | 0.899979 |
6 | 0.915923 | 0.993366 |
7 | 0.169946 | 0.724387 |
8 | 0.210376 | 0.536203 |
9 | 0.538116 | 0.367618 |
既存のデータシートをloadする場合#
pandasで対応しているフォーマットはさまざま
csv
excel
json
HTML
XML
df = pd.read_csv('../data/world-happiness/2015.csv')
df
| | Country | Region | Happiness Rank | Happiness Score | Standard Error | Economy (GDP per Capita) | Family | Health (Life Expectancy) | Freedom | Trust (Government Corruption) | Generosity | Dystopia Residual |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Switzerland | Western Europe | 1 | 7.587 | 0.03411 | 1.39651 | 1.34951 | 0.94143 | 0.66557 | 0.41978 | 0.29678 | 2.51738 |
1 | Iceland | Western Europe | 2 | 7.561 | 0.04884 | 1.30232 | 1.40223 | 0.94784 | 0.62877 | 0.14145 | 0.43630 | 2.70201 |
2 | Denmark | Western Europe | 3 | 7.527 | 0.03328 | 1.32548 | 1.36058 | 0.87464 | 0.64938 | 0.48357 | 0.34139 | 2.49204 |
3 | Norway | Western Europe | 4 | 7.522 | 0.03880 | 1.45900 | 1.33095 | 0.88521 | 0.66973 | 0.36503 | 0.34699 | 2.46531 |
4 | Canada | North America | 5 | 7.427 | 0.03553 | 1.32629 | 1.32261 | 0.90563 | 0.63297 | 0.32957 | 0.45811 | 2.45176 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
153 | Rwanda | Sub-Saharan Africa | 154 | 3.465 | 0.03464 | 0.22208 | 0.77370 | 0.42864 | 0.59201 | 0.55191 | 0.22628 | 0.67042 |
154 | Benin | Sub-Saharan Africa | 155 | 3.340 | 0.03656 | 0.28665 | 0.35386 | 0.31910 | 0.48450 | 0.08010 | 0.18260 | 1.63328 |
155 | Syria | Middle East and Northern Africa | 156 | 3.006 | 0.05015 | 0.66320 | 0.47489 | 0.72193 | 0.15684 | 0.18906 | 0.47179 | 0.32858 |
156 | Burundi | Sub-Saharan Africa | 157 | 2.905 | 0.08658 | 0.01530 | 0.41587 | 0.22396 | 0.11850 | 0.10062 | 0.19727 | 1.83302 |
157 | Togo | Sub-Saharan Africa | 158 | 2.839 | 0.06727 | 0.20868 | 0.13995 | 0.28443 | 0.36453 | 0.10731 | 0.16681 | 1.56726 |
158 rows × 12 columns
pandasでmethodをchainする#
DataFrameのmethod(rename, query, headなど)はDataFrameを返すため、複数のmethodを続けて呼び出す(chainする)ことが可能
df.query("Region == 'Western Europe'").sort_values(by="Health (Life Expectancy)", ascending=False).head(n=10)
| | Country | Region | Happiness Rank | Happiness Score | Standard Error | Economy (GDP per Capita) | Family | Health (Life Expectancy) | Freedom | Trust (Government Corruption) | Generosity | Dystopia Residual |
---|---|---|---|---|---|---|---|---|---|---|---|---|
35 | Spain | Western Europe | 36 | 6.329 | 0.03468 | 1.23011 | 1.31379 | 0.95562 | 0.45951 | 0.06398 | 0.18227 | 2.12367 |
49 | Italy | Western Europe | 50 | 5.948 | 0.03914 | 1.25114 | 1.19777 | 0.95446 | 0.26236 | 0.02901 | 0.22823 | 2.02518 |
1 | Iceland | Western Europe | 2 | 7.561 | 0.04884 | 1.30232 | 1.40223 | 0.94784 | 0.62877 | 0.14145 | 0.43630 | 2.70201 |
28 | France | Western Europe | 29 | 6.575 | 0.03512 | 1.27778 | 1.26038 | 0.94579 | 0.55011 | 0.20646 | 0.12332 | 2.21126 |
0 | Switzerland | Western Europe | 1 | 7.587 | 0.03411 | 1.39651 | 1.34951 | 0.94143 | 0.66557 | 0.41978 | 0.29678 | 2.51738 |
66 | Cyprus | Western Europe | 67 | 5.689 | 0.05580 | 1.20813 | 0.89318 | 0.92356 | 0.40672 | 0.06146 | 0.30638 | 1.88931 |
65 | North Cyprus | Western Europe | 66 | 5.695 | 0.05635 | 1.20806 | 1.07008 | 0.92356 | 0.49027 | 0.14280 | 0.26169 | 1.59888 |
16 | Luxembourg | Western Europe | 17 | 6.946 | 0.03499 | 1.56391 | 1.21963 | 0.91894 | 0.61583 | 0.37798 | 0.28034 | 1.96961 |
7 | Sweden | Western Europe | 8 | 7.364 | 0.03157 | 1.33171 | 1.28907 | 0.91087 | 0.65980 | 0.43844 | 0.36262 | 2.37119 |
20 | United Kingdom | Western Europe | 21 | 6.867 | 0.01866 | 1.26637 | 1.28548 | 0.90943 | 0.59625 | 0.32067 | 0.51912 | 1.96994 |
df.query("Region == 'Eastern Asia'").sort_values(by="Freedom", ascending=False).head(n=10)
| | Country | Region | Happiness Rank | Happiness Score | Standard Error | Economy (GDP per Capita) | Family | Health (Life Expectancy) | Freedom | Trust (Government Corruption) | Generosity | Dystopia Residual |
---|---|---|---|---|---|---|---|---|---|---|---|---|
71 | Hong Kong | Eastern Asia | 72 | 5.474 | 0.05051 | 1.38604 | 1.05818 | 1.01328 | 0.59608 | 0.37124 | 0.39478 | 0.65429 |
83 | China | Eastern Asia | 84 | 5.140 | 0.02424 | 0.89012 | 0.94675 | 0.81658 | 0.51697 | 0.02781 | 0.08185 | 1.86040 |
45 | Japan | Eastern Asia | 46 | 5.987 | 0.03581 | 1.27074 | 1.25712 | 0.99111 | 0.49615 | 0.18060 | 0.10705 | 1.68435 |
99 | Mongolia | Eastern Asia | 100 | 4.874 | 0.03313 | 0.82819 | 1.30060 | 0.60268 | 0.43626 | 0.02666 | 0.33230 | 1.34759 |
37 | Taiwan | Eastern Asia | 38 | 6.298 | 0.03868 | 1.29098 | 1.07617 | 0.87530 | 0.39740 | 0.08129 | 0.25376 | 2.32323 |
46 | South Korea | Eastern Asia | 47 | 5.984 | 0.04098 | 1.24461 | 0.95774 | 0.96538 | 0.33208 | 0.07857 | 0.18557 | 2.21978 |
統計情報を取得する#
# .describe()で全体感を把握する
df.describe()
| | Happiness Rank | Happiness Score | Standard Error | Economy (GDP per Capita) | Family | Health (Life Expectancy) | Freedom | Trust (Government Corruption) | Generosity | Dystopia Residual |
---|---|---|---|---|---|---|---|---|---|---|
count | 158.000000 | 158.000000 | 158.000000 | 158.000000 | 158.000000 | 158.000000 | 158.000000 | 158.000000 | 158.000000 | 158.000000 |
mean | 79.493671 | 5.375734 | 0.047885 | 0.846137 | 0.991046 | 0.630259 | 0.428615 | 0.143422 | 0.237296 | 2.098977 |
std | 45.754363 | 1.145010 | 0.017146 | 0.403121 | 0.272369 | 0.247078 | 0.150693 | 0.120034 | 0.126685 | 0.553550 |
min | 1.000000 | 2.839000 | 0.018480 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.328580 |
25% | 40.250000 | 4.526000 | 0.037268 | 0.545808 | 0.856822 | 0.439185 | 0.328330 | 0.061675 | 0.150553 | 1.759410 |
50% | 79.500000 | 5.232500 | 0.043940 | 0.910245 | 1.029510 | 0.696705 | 0.435515 | 0.107220 | 0.216130 | 2.095415 |
75% | 118.750000 | 6.243750 | 0.052300 | 1.158448 | 1.214405 | 0.811013 | 0.549092 | 0.180255 | 0.309883 | 2.462415 |
max | 158.000000 | 7.587000 | 0.136930 | 1.690420 | 1.402230 | 1.025250 | 0.669730 | 0.551910 | 0.795880 | 3.602140 |