數據科學和人工智能技術筆記 十九、數據整理(5)

十九、數據整理(5)

作者:Chris Albon

譯者:飛龍

協議:CC BY-NC-SA 4.0

規範化一列

# 導(dǎo)入所需模塊
import pandas as pd
from sklearn import preprocessing

# 設(shè)置圖表為內(nèi)聯(lián)
%matplotlib inline

# Build a sample DataFrame holding a single un-normalized 'score' column.
data = dict(score=[234, 24, 14, 27, -74, 46, 73, -18, 59, 160])
df = pd.DataFrame.from_dict(data)
df
score
0 234
1 24
2 14
3 27
4 -74
5 46
6 73
7 -18
8 59
9 160
# Plot the raw (un-normalized) scores as a bar chart
df['score'].plot(kind='bar')

# <matplotlib.axes._subplots.AxesSubplot at 0x11b9c88d0> 
png
# Take the 'score' column as a float array; the double brackets select a
# one-column DataFrame, so x is 2-D (one row per score, one column)
x = df[['score']].values.astype(float)

# Create the min-max scaler object
min_max_scaler = preprocessing.MinMaxScaler()

# Fit the scaler to x and return the rescaled values in one step
x_scaled = min_max_scaler.fit_transform(x)

# Wrap the scaled array in a new DataFrame (its single column is labelled 0)
df_normalized = pd.DataFrame(x_scaled)

# View the DataFrame
df_normalized
0
0 1.000000
1 0.318182
2 0.285714
3 0.327922
4 0.000000
5 0.389610
6 0.477273
7 0.181818
8 0.431818
9 0.759740
# Plot the normalized values as a bar chart
df_normalized.plot(kind='bar')

# <matplotlib.axes._subplots.AxesSubplot at 0x11ba31c50> 
png

Pandas 中的透視表(級聯表)

# 導(dǎo)入模塊
import pandas as pd

# Sample records: three regiments, each with a 1st and a 2nd company and
# two test scores per company.
raw_data = {
    'regiment': ['Nighthawks'] * 4 + ['Dragoons'] * 4 + ['Scouts'] * 4,
    'company': ['1st', '1st', '2nd', '2nd'] * 3,
    'TestScore': [4, 24, 31, 2, 3, 4, 24, 31, 2, 3, 2, 3],
}
# The dict keys are already in the desired column order.
df = pd.DataFrame(raw_data, columns=list(raw_data))
df
regiment company TestScore
0 Nighthawks 1st 4
1 Nighthawks 1st 24
2 Nighthawks 2nd 31
3 Nighthawks 2nd 2
4 Dragoons 1st 3
5 Dragoons 1st 4
6 Dragoons 2nd 24
7 Dragoons 2nd 31
8 Scouts 1st 2
9 Scouts 1st 3
10 Scouts 2nd 2
11 Scouts 2nd 3
# Pivot table of group means, grouped by regiment and then company
pd.pivot_table(df, index=['regiment','company'], aggfunc='mean')
TestScore
regiment company
Dragoons 1st 3.5
2nd 27.5
Nighthawks 1st 14.0
2nd 16.5
Scouts 1st 2.5
2nd 2.5
# Pivot table of group counts, grouped by regiment and then company
df.pivot_table(index=['regiment','company'], aggfunc='count')
TestScore
regiment company
Dragoons 1st 2
2nd 2
Nighthawks 1st 2
2nd 2
Scouts 1st 2
2nd 2

在 Pandas 中快速修改字符串列

我經常需要或想要改變一組字符串中所有項目的大小寫(例如 BRAZIL → Brazil 等)。有很多方法可以實現這一目標,但我已經確定這是最容易和最快的方法。

# 導(dǎo)入 pandas
import pandas as pd

# Create a Series of names (despite the variable name, each entry holds a
# first and a last name)
first_names = pd.Series(['Steve Murrey', 'Jane Fonda', 'Sara McGully', 'Mary Jane'])

# Display the Series
first_names

'''
0    Steve Murrey
1      Jane Fonda
2    Sara McGully
3       Mary Jane
dtype: object 
'''

# Display the Series with every string converted to lower case
first_names.str.lower()

'''
0    steve murrey
1      jane fonda
2    sara mcgully
3       mary jane
dtype: object 
'''

# Display the Series with every string converted to upper case
first_names.str.upper()

'''
0    STEVE MURREY
1      JANE FONDA
2    SARA MCGULLY
3       MARY JANE
dtype: object 
'''

# Display the Series in title case (each word capitalized; note that
# 'McGully' becomes 'Mcgully' in the output)
first_names.str.title()

'''
0    Steve Murrey
1      Jane Fonda
2    Sara Mcgully
3       Mary Jane
dtype: object 
'''

# Display the Series with every string split on single spaces into a list
first_names.str.split(" ")

'''
0    [Steve, Murrey]
1      [Jane, Fonda]
2    [Sara, McGully]
3       [Mary, Jane]
dtype: object 
'''

# Display the Series with only the first character of each string
# upper-cased and the rest lower-cased
first_names.str.capitalize()

'''
0    Steve murrey
1      Jane fonda
2    Sara mcgully
3       Mary jane
dtype: object 
'''

明白了吧。更多字符串方法請參閱 pandas 官方文檔中關於 Series.str 的說明。

隨機抽樣數據幀

# 導(dǎo)入模塊
import pandas as pd
import numpy as np

# Sample records for five people with their pre- and post-test scores.
raw_data = dict(
    first_name=['Jason', 'Molly', 'Tina', 'Jake', 'Amy'],
    last_name=['Miller', 'Jacobson', 'Ali', 'Milner', 'Cooze'],
    age=[42, 52, 36, 24, 73],
    preTestScore=[4, 24, 31, 2, 3],
    postTestScore=[25, 94, 57, 62, 70],
)
# The keys are already in the desired column order, so reuse them.
df = pd.DataFrame(raw_data, columns=list(raw_data))
df
first_name last_name age preTestScore postTestScore
0 Jason Miller 42 4 25
1 Molly Jacobson 52 24 94
2 Tina Ali 36 31 57
3 Jake Milner 24 2 62
4 Amy Cooze 73 3 70
# Draw a random subset of 2 rows without replacement.
# DataFrame.sample is the built-in way to do this (sampling without
# replacement is its default); pass random_state=<int> for a repeatable draw.
df.sample(n=2)
first_name last_name age preTestScore postTestScore
1 Molly Jacobson 52 24 94
4 Amy Cooze 73 3 70

對數據幀的行排名

# 導(dǎo)入模塊
import pandas as pd

# Build the sample DataFrame and label its rows with county names.
data = dict(
    name=['Jason', 'Molly', 'Tina', 'Jake', 'Amy'],
    year=[2012, 2012, 2013, 2014, 2014],
    reports=[4, 24, 31, 2, 3],
    coverage=[25, 94, 57, 62, 70],
)
df = pd.DataFrame(data)
df.index = ['Cochice', 'Pima', 'Santa Cruz', 'Maricopa', 'Yuma']
df
coverage name reports year
Cochice 25 Jason 4 2012
Pima 94 Molly 24 2012
Santa Cruz 57 Tina 31 2013
Maricopa 62 Jake 2 2014
Yuma 70 Amy 3 2014

5 rows × 4 columns

# Add a 'coverageRanked' column holding the ascending rank of each
# 'coverage' value (rank 1 is the smallest coverage).
# `ascending` is a boolean flag, so pass True rather than the integer 1.
df['coverageRanked'] = df['coverage'].rank(ascending=True)
df
coverage name reports year coverageRanked
Cochice 25 Jason 4 2012 1
Pima 94 Molly 24 2012 5
Santa Cruz 57 Tina 31 2013 2
Maricopa 62 Jake 2 2014 3
Yuma 70 Amy 3 2014 4

5 rows × 5 columns

正則表達式基礎

# 導(dǎo)入正則包
import re

import sys

# Sample sentence used by the examples in this section.
text = 'The quick brown fox jumped over the lazy black bear.'

# Pattern for three consecutive word characters. A raw string keeps the
# backslash literal; '\w' in a plain string is an invalid escape sequence
# that Python warns about.
three_letter_word = r'\w{3}'

# Compile the pattern; the bare name on the next line displays the compiled
# object when run in a notebook.
pattern_re = re.compile(three_letter_word)
pattern_re

# re.compile(r'\w{3}', re.UNICODE) 

# Search anywhere in text for any two characters followed by 'own'.
re_search = re.search('..own', text)

if re_search:
    # Print the text that matched
    print(re_search.group())

# brown 

re.match

re.match() 僅用於匹配字符串的開頭或整個字符串。對於其他任何內容,請使用 re.search。

嘗試從文本開頭匹配模式 '..own'

# Try to match '..own' at the very beginning of text. re.match only
# succeeds when the pattern matches starting at the first character.
re_match = re.match('..own', text)

if re_match:
    # Print the text that matched
    print(re_match.group())
else:
    # Report that the pattern did not match at the start of the string
    print('No matches')

# No matches 

re.split

# Split text at every 'e'; the bare name displays the resulting list
re_split = re.split('e', text); re_split

# ['Th', ' quick brown fox jump', 'd ov', 'r th', ' lazy black b', 'ar.'] 

re.sub

用其他內容替換與正則表達式模式匹配的子串。3 表示要進行的最大替換次數。

# Replace the first three occurrences of 'e' with 'E', then print the result.
# count is passed by keyword: passing it positionally is deprecated in
# recent Python versions and is easy to confuse with the flags argument.
re_sub = re.sub('e', 'E', text, count=3); print(re_sub)

# ThE quick brown fox jumpEd ovEr the lazy black bear. 

正則表達式示例

# 導(dǎo)入 regex
import re

# Create the sample text used by the examples below
text = 'A flock of 120 quick brown foxes jumped over 30 lazy brown, bears.'

# 'A' at the start of the string
re.findall('^A', text)

# ['A'] 

# 'bears' followed by any one character, at the end of the string
re.findall('bears.$', text)

# ['bears.'] 

# 'f', then any two characters, then 'es'
re.findall('f..es', text)

# ['foxes'] 

# Find every lowercase vowel
re.findall('[aeiou]', text)

# ['o', 'o', 'u', 'i', 'o', 'o', 'e', 'u', 'e', 'o', 'e', 'a', 'o', 'e', 'a'] 
# Find every character that is not a lowercase vowel
re.findall('[^aeiou]', text)

'''
['A',
 ' ',
 'f',
 'l',
 'c',
 'k',
 ' ',
 'f',
 ' ',
 '1',
 '2',
 '0',
 ' ',
 'q',
 'c',
 'k',
 ' ',
 'b',
 'r',
 'w',
 'n',
 ' ',
 'f',
 'x',
 's',
 ' ',
 'j',
 'm',
 'p',
 'd',
 ' ',
 'v',
 'r',
 ' ',
 '3',
 '0',
 ' ',
 'l',
 'z',
 'y',
 ' ',
 'b',
 'r',
 'w',
 'n',
 ',',
 ' ',
 'b',
 'r',
 's',
 '.'] 
'''

# Match either a lowercase 'a' or an uppercase 'A'
re.findall('a|A', text)

# ['A', 'a', 'a'] 

# Find every occurrence of 'foxes'; the parentheses form a capture group,
# so findall returns the group's text
re.findall('(foxes)', text)

# ['foxes'] 

# Find non-overlapping runs of five consecutive word characters. These are
# not necessarily whole five-letter words: 'jumpe' below comes from 'jumped'.
# Raw strings keep the regex backslashes literal and avoid the invalid-escape
# warning Python gives for sequences such as '\w' in a plain string.
re.findall(r'\w\w\w\w\w', text)

# ['flock', 'quick', 'brown', 'foxes', 'jumpe', 'brown', 'bears'] 

# Two consecutive non-word characters
re.findall(r'\W\W', text)

# [', '] 

# Every whitespace character
re.findall(r'\s', text)

# [' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' '] 

# Non-overlapping pairs of consecutive non-whitespace characters
re.findall(r'\S\S', text)

'''
['fl',
 'oc',
 'of',
 '12',
 'qu',
 'ic',
 'br',
 'ow',
 'fo',
 'xe',
 'ju',
 'mp',
 'ed',
 'ov',
 'er',
 '30',
 'la',
 'zy',
 'br',
 'ow',
 'n,',
 'be',
 'ar',
 's.'] 
'''

# Three consecutive digits. Raw strings keep the regex backslashes literal
# and avoid the invalid-escape warning Python gives for '\d' in a plain string.
re.findall(r'\d\d\d', text)

# ['120'] 

# Non-overlapping runs of five consecutive non-digit characters
re.findall(r'\D\D\D\D\D', text)

'''
['A flo',
 'ck of',
 ' quic',
 'k bro',
 'wn fo',
 'xes j',
 'umped',
 ' over',
 ' lazy',
 ' brow',
 'n, be'] 
'''

# 'A' at the very start of the string (\A anchors to the string start)
re.findall(r'\AA', text)

# ['A'] 

# 'bears' plus any one character at the very end of the string
# (\Z anchors to the string end)
re.findall(r'bears.\Z', text)

# ['bears.'] 

# One of the letters f, o, x, e, s at the start of a word. The pattern must
# be a raw string: in a plain string '\b' is a backspace character rather
# than the regex word boundary, which made this example return [].
re.findall(r'\b[foxes]', text)

# ['f', 'o', 'f', 'o'] 

# Newline characters
re.findall('\n', text)

# [] 

# 'foxes' starting with either an upper- or a lower-case 'f'
re.findall('[Ff]oxes', 'foxes Foxes Doxes')

# ['foxes', 'Foxes'] 

# Same pattern again: a character set lists the accepted alternatives
re.findall('[Ff]oxes', 'foxes Foxes Doxes')

# ['foxes', 'Foxes'] 

# Every lowercase ASCII letter
re.findall('[a-z]', 'foxes Foxes')

# ['f', 'o', 'x', 'e', 's', 'o', 'x', 'e', 's'] 

# Every uppercase ASCII letter
re.findall('[A-Z]', 'foxes Foxes')

# ['F'] 

# Every ASCII letter or digit
re.findall('[a-zA-Z0-9]', 'foxes Foxes')

# ['f', 'o', 'x', 'e', 's', 'F', 'o', 'x', 'e', 's'] 

# Every character that is not a lowercase vowel
re.findall('[^aeiou]', 'foxes Foxes')

# ['f', 'x', 's', ' ', 'F', 'x', 's'] 

# Every character that is not a digit
re.findall('[^0-9]', 'foxes Foxes')

# ['f', 'o', 'x', 'e', 's', ' ', 'F', 'o', 'x', 'e', 's'] 

# 'foxe' followed by an optional 's' ('?' means zero or one)
re.findall('foxes?', 'foxes Foxes')

# ['foxes'] 

# 'o' followed by zero or more 'x'
re.findall('ox*', 'foxes Foxes')

# ['ox', 'ox'] 

# 'o' followed by one or more 'x'
re.findall('ox+', 'foxes Foxes')

# ['ox', 'ox'] 

# Exactly three consecutive digits. Raw strings keep the regex backslashes
# literal and avoid the invalid-escape warning Python gives for sequences
# such as '\d', '\A' and '\Z' in a plain string.
re.findall(r'\d{3}', text)

# ['120'] 

# Two or more consecutive digits
re.findall(r'\d{2,}', text)

# ['120', '30'] 

# Between two and three consecutive digits
re.findall(r'\d{2,3}', text)

# ['120', '30'] 

# 'A' at the start of the string
re.findall('^A', text)

# ['A'] 

# 'bears' plus any one character at the end of the string
re.findall('bears.$', text)

# ['bears.'] 

# 'A' at the very start of the string (\A anchors to the string start)
re.findall(r'\AA', text)

# ['A'] 

# 'bears' plus any one character at the very end of the string
# (\Z anchors to the string end)
re.findall(r'bears.\Z', text)

# ['bears.'] 

# Positive lookahead: 'bears' only where one more character follows it;
# the following character is not part of the match
re.findall('bears(?=.)', text)

# ['bears'] 

# Negative lookahead: 'foxes' only where it is not followed by '!'
re.findall('foxes(?!!)', 'foxes foxes!')

# ['foxes'] 

# Alternation: 'foxes' or 'foxes!'; the first alternative wins both times
re.findall('foxes|foxes!', 'foxes foxes!')

# ['foxes', 'foxes'] 

# With a capture group, findall returns only the captured part
re.findall('fox(es!)', 'foxes foxes!')

# ['es!'] 

# Capture only the '!' that follows 'foxes'
re.findall('foxes(!)', 'foxes foxes!')

# ['!'] 

重索引序列和數據幀

# 導(dǎo)入模塊
import pandas as pd
import numpy as np

# Create a Series of brush-fire risk values indexed by southern Arizona town
brushFireRisk = pd.Series([34, 23, 12, 23], index = ['Bisbee', 'Douglas', 'Sierra Vista', 'Tombstone'])
brushFireRisk

'''
Bisbee          34
Douglas         23
Sierra Vista    12
Tombstone       23
dtype: int64 
'''

# Reindex the Series into a new variable; labels that are not in the
# original index ('Barley', 'Tucson') show up as NaN in the output
brushFireRiskReindexed = brushFireRisk.reindex(['Tombstone', 'Douglas', 'Bisbee', 'Sierra Vista', 'Barley', 'Tucson'])
brushFireRiskReindexed

'''
Tombstone       23.0
Douglas         23.0
Bisbee          34.0
Sierra Vista    12.0
Barley           NaN
Tucson           NaN
dtype: float64 
'''

# Reindex the Series and fill any label missing from the original index with 0
brushFireRiskReindexed = brushFireRisk.reindex(['Tombstone', 'Douglas', 'Bisbee', 'Sierra Vista', 'Barley', 'Tucson'], fill_value = 0)
brushFireRiskReindexed

'''
Tombstone       23
Douglas         23
Bisbee          34
Sierra Vista    12
Barley           0
Tucson           0
dtype: int64 
'''

# Build the sample DataFrame of yearly report counts per county.
data = dict(
    county=['Cochice', 'Pima', 'Santa Cruz', 'Maricopa', 'Yuma'],
    year=[2012, 2012, 2013, 2014, 2014],
    reports=[4, 24, 31, 2, 3],
)
df = pd.DataFrame.from_dict(data)
df
county reports year
0 Cochice 4 2012
1 Pima 24 2012
2 Santa Cruz 31 2013
3 Maricopa 2 2014
4 Yuma 3 2014
# Display the rows in a new order by reindexing on the row labels
# (the result is not assigned, so df itself is unchanged)
df.reindex([4, 3, 2, 1, 0])
county reports year
4 Yuma 3 2014
3 Maricopa 2 2014
2 Santa Cruz 31 2013
1 Pima 24 2012
0 Cochice 4 2012
# Display the columns in a new order by reindexing on the column labels
# (the result is not assigned, so df itself is unchanged)
columnsTitles = ['year', 'reports', 'county']
df.reindex(columns=columnsTitles)
year reports county
0 2012 4 Cochice
1 2012 24 Pima
2 2013 31 Santa Cruz
3 2014 2 Maricopa
4 2014 3 Yuma

重命名列標題

來自 StackOverflow 上的 rgalbo

# 導(dǎo)入所需模塊
import pandas as pd

# Build a dict of column lists keyed '0'..'3'; the first entry of every
# list is the intended header label for that column.
raw_data = {
    str(position): values
    for position, values in enumerate([
        ['first_name', 'Molly', 'Tina', 'Jake', 'Amy'],
        ['last_name', 'Jacobson', 'Ali', 'Milner', 'Cooze'],
        ['age', 52, 36, 24, 73],
        ['preTestScore', 24, 31, 2, 3],
    ])
}

# Create the DataFrame
df = pd.DataFrame(raw_data)

# View the DataFrame
df
0 1 2 3
0 first_name last_name age preTestScore
1 Molly Jacobson 52 24
2 Tina Ali 36 31
3 Jake Milner 24 2
4 Amy Cooze 73 3
# Save the first row of the DataFrame in a new variable named header
header = df.iloc[0]

'''
0      first_name
1       last_name
2             age
3    preTestScore
Name: 0, dtype: object 
'''

# Drop the first row, which holds the header labels rather than data
df = df[1:]

# Rename the columns using the labels saved in `header`. rename returns a
# new DataFrame, so assign the result back; otherwise df keeps its old
# column labels. The bare name then displays the renamed DataFrame.
df = df.rename(columns=header)
df
first_name last_name age preTestScore
1 Molly Jacobson 52 24
2 Tina Ali 36 31
3 Jake Milner 24 2
4 Amy Cooze 73 3

重命名數據幀的多個列名

# 導(dǎo)入模塊
import pandas as pd

# Show up to 1000 rows when a DataFrame is displayed. The full option name
# 'display.max_rows' is used; the abbreviated 'display.max_row' only resolves
# through pandas' pattern matching of option names.
pd.set_option('display.max_rows', 1000)

# Show up to 50 columns when a DataFrame is displayed (this is a column
# count, not a column width)
pd.set_option('display.max_columns', 50)

# Build the sample DataFrame, indexed by county name; every row carries the
# same date string.
data = {
    'Commander': ['Jason', 'Molly', 'Tina', 'Jake', 'Amy'],
    'Date': ['2012, 02, 08'] * 5,
    'Score': [4, 24, 31, 2, 3],
}
df = pd.DataFrame(data)
df.index = ['Cochice', 'Pima', 'Santa Cruz', 'Maricopa', 'Yuma']
df
Commander Date Score
Cochice Jason 2012, 02, 08 4
Pima Molly 2012, 02, 08 24
Santa Cruz Tina 2012, 02, 08 31
Maricopa Jake 2012, 02, 08 2
Yuma Amy 2012, 02, 08 3
# Rename every column at once by assigning a new list of labels
df.columns = ['Leader', 'Time', 'Score']

df
Leader Time Score
Cochice Jason 2012, 02, 08 4
Pima Molly 2012, 02, 08 24
Santa Cruz Tina 2012, 02, 08 31
Maricopa Jake 2012, 02, 08 2
Yuma Amy 2012, 02, 08 3
# Rename a single column in place: 'Leader' becomes 'Commander'
df.rename(columns={'Leader': 'Commander'}, inplace=True)

df
Commander Time Score
Cochice Jason 2012, 02, 08 4
Pima Molly 2012, 02, 08 24
Santa Cruz Tina 2012, 02, 08 31
Maricopa Jake 2012, 02, 08 2
Yuma Amy 2012, 02, 08 3

替換值

# 導(dǎo)入模塊
import pandas as pd
import numpy as np

# Sample records in which -999 stands in for a missing test score.
raw_data = dict(
    first_name=['Jason', 'Molly', 'Tina', 'Jake', 'Amy'],
    last_name=['Miller', 'Jacobson', 'Ali', 'Milner', 'Cooze'],
    age=[42, 52, 36, 24, 73],
    preTestScore=[-999, -999, -999, 2, 1],
    postTestScore=[2, 2, -999, 2, -999],
)
# The keys are already in the desired column order, so reuse them.
df = pd.DataFrame(raw_data, columns=list(raw_data))
df
first_name last_name age preTestScore postTestScore
0 Jason Miller 42 -999 2
1 Molly Jacobson 52 -999 2
2 Tina Ali 36 -999 -999
3 Jake Milner 24 2 2
4 Amy Cooze 73 1 -999
# Display a copy of the DataFrame with every -999 replaced by NaN
# (the result is not assigned, so df itself is unchanged)
df.replace(-999, np.nan)
first_name last_name age preTestScore postTestScore
0 Jason Miller 42 NaN 2.0
1 Molly Jacobson 52 NaN 2.0
2 Tina Ali 36 NaN NaN
3 Jake Milner 24 2.0 2.0
4 Amy Cooze 73 1.0 NaN

將數據幀保存為 CSV

# 導(dǎo)入模塊
import pandas as pd

# Sample records for five people with their pre- and post-test scores.
raw_data = {
    'first_name': ['Jason', 'Molly', 'Tina', 'Jake', 'Amy'],
    'last_name': ['Miller', 'Jacobson', 'Ali', 'Milner', 'Cooze'],
    'age': [42, 52, 36, 24, 73],
    'preTestScore': [4, 24, 31, 2, 3],
    'postTestScore': [25, 94, 57, 62, 70],
}
# Build the frame, then fix the column order from the dict keys.
df = pd.DataFrame(raw_data).reindex(columns=list(raw_data))
df
first_name last_name age preTestScore postTestScore
0 Jason Miller 42 4 25
1 Molly Jacobson 52 24 94
2 Tina Ali 36 31 57
3 Jake Milner 24 2 62
4 Amy Cooze 73 3 70

將名為 df 的數據幀保存為 CSV 文件。

# Write df to 'example.csv' (a relative path); the row index is written too
# because index is not disabled
df.to_csv('example.csv')
?著作權(quán)歸作者所有,轉(zhuǎn)載或內(nèi)容合作請(qǐng)聯(lián)系作者
【社區內容提示】社區部分內容疑似由AI輔助生成,瀏覽時請結合常識與多方信息審慎甄別。
平臺聲明:文章內容(如有圖片或視頻亦包括在內)由作者上傳並發布,文章內容僅代表作者本人觀點,簡書係信息發布平臺,僅提供信息存儲服務。

相關(guān)閱讀更多精彩內(nèi)容

友情鏈接更多精彩內(nèi)容