pandas學習記錄
環境配置
1.python3.6.5 pandas0.19.2
pip install pandas
1.通過pandas讀取csv文件,及常用的csv方法
import numpy as np
import pandas as pd
# Location of the CSV file to load.
csv_path = './test.csv'
# read_csv parses the CSV file into a DataFrame.
#   skiprows=1          -> skip the first line at the top of the file
#   na_values="missing" -> treat the placeholder text "missing" as NaN
file = pd.read_csv(
    csv_path,
    skiprows=1,
    na_values="missing",
)
print(file)
# head(5) returns the first five rows of the frame.
print(file.head(n=5))
# Show every column when printing a DataFrame (None removes the limit).
pd.set_option('display.max_columns', None)
# Show every row when printing a DataFrame (None removes the limit).
pd.set_option('display.max_rows', None)
# Allow up to 100 characters per cell before the value is cut off with "..."
# (the default is 50). The full option name is spelled out because partial
# names such as 'max_colwidth' may break in future versions if a new option
# with a similar name is introduced.
pd.set_option('display.max_colwidth', 100)
# Select every row whose "org_id" column equals 13486.
# A boolean mask compares the whole column at once. Compared with collecting
# rows from file.iterrows() (which yields (index, row) pairs), it keeps each
# column's dtype and still returns the original columns when nothing matches.
# .copy() makes the result an independent DataFrame rather than a slice of
# `file`.
test = file[file["org_id"] == 13486].copy()
print(test)
test.to_csv("aaa.csv")  # write the selected rows to aaa.csv
# Two small frames sharing the 'id' and 'key' columns, used to try out merge.
left = pd.DataFrame({
    'id': [1, 1],
    'key': ['foo', 'foo'],
    'lval': [1, 2],
})
right = pd.DataFrame({
    'id': [1, 1],
    'key': ['foo', 'foo'],
    'rval': [4, 5],
})
# pd.merge(left, right, on='id') joins the two DataFrame objects;
# on= names the column used to match rows between them.
# Example frame for groupby: two label columns (A, B) and two columns of
# random floats (C, D) generated with np.random.randn.
df = pd.DataFrame({
    'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'],
    'B': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'],
    'C': np.random.randn(8),
    'D': np.random.randn(8),
})
# df.groupby(['A', 'B']).sum() groups the rows by each distinct (A, B) pair
# and sums the numeric columns C and D within each group.
set_option()的所有屬性:
Available options:
- display.[chop_threshold, colheader_justify, column_space, date_dayfirst,
? date_yearfirst, encoding, expand_frame_repr, float_format, height, large_repr]
- display.latex.[escape, longtable, repr]
- display.[line_width, max_categories, max_columns, max_colwidth,
? max_info_columns, max_info_rows, max_rows, max_seq_items, memory_usage,
? mpl_style, multi_sparse, notebook_repr_html, pprint_nest_depth, precision,
? show_dimensions]
- display.unicode.[ambiguous_as_wide, east_asian_width]
- display.[width]
- io.excel.xls.[writer]
- io.excel.xlsm.[writer]
- io.excel.xlsx.[writer]
- io.hdf.[default_format, dropna_table]
- mode.[chained_assignment, sim_interactive, use_inf_as_null]
Parameters
----------
pat : str
? ? Regexp which should match a single option.
? ? Note: partial matches are supported for convenience, but unless you use the
? ? full option name (e.g. x.y.z.option_name), your code may break in future
? ? versions if new options with similar names are introduced.
value :
? ? new value of option.
Returns
-------
None
Raises
------
OptionError if no such option exists
Notes
-----
The available options with their descriptions:
display.chop_threshold : float or None
    if set to a float value, all float values smaller than the given threshold
    will be displayed as exactly 0 by repr and friends.
    [default: None] [currently: None]
display.colheader_justify : 'left'/'right'
? ? Controls the justification of column headers. used by DataFrameFormatter.
? ? [default: right] [currently: right]
display.column_space No description available.
? ? [default: 12] [currently: 12]
display.date_dayfirst : boolean
? ? When True, prints and parses dates with the day first, eg 20/01/2005
? ? [default: False] [currently: False]
display.date_yearfirst : boolean
? ? When True, prints and parses dates with the year first, eg 2005/01/20
? ? [default: False] [currently: False]
display.encoding : str/unicode
? ? Defaults to the detected encoding of the console.
? ? Specifies the encoding to be used for strings returned by to_string,
? ? these are generally strings meant to be displayed on the console.
? ? [default: UTF-8] [currently: UTF-8]
display.expand_frame_repr : boolean
? ? Whether to print out the full DataFrame repr for wide DataFrames across
? ? multiple lines, `max_columns` is still respected, but the output will
? ? wrap-around across multiple "pages" if its width exceeds `display.width`.
? ? [default: True] [currently: True]
display.float_format : callable
? ? The callable should accept a floating point number and return
? ? a string with the desired format of the number. This is used
? ? in some places like SeriesFormatter.
? ? See formats.format.EngFormatter for an example.
? ? [default: None] [currently: None]
display.height : int
? ? Deprecated.
? ? [default: 60] [currently: 60]
? ? (Deprecated, use `display.max_rows` instead.)
display.large_repr : 'truncate'/'info'
? ? For DataFrames exceeding max_rows/max_cols, the repr (and HTML repr) can
? ? show a truncated table (the default from 0.13), or switch to the view from
? ? df.info() (the behaviour in earlier versions of pandas).
? ? [default: truncate] [currently: truncate]
display.latex.escape : bool
    This specifies if the to_latex method of a DataFrame escapes special
    characters.
    Valid values: False,True
    [default: True] [currently: True]
display.latex.longtable : bool
    This specifies if the to_latex method of a DataFrame uses the longtable
    format.
    Valid values: False,True
    [default: False] [currently: False]
display.latex.repr : boolean
? ? Whether to produce a latex DataFrame representation for jupyter
? ? environments that support it.
? ? (default: False)
? ? [default: False] [currently: False]
display.line_width : int
? ? Deprecated.
? ? [default: 80] [currently: 80]
? ? (Deprecated, use `display.width` instead.)
display.max_categories : int
? ? This sets the maximum number of categories pandas should output when
? ? printing out a `Categorical` or a Series of dtype "category".
? ? [default: 8] [currently: 8]
display.max_columns : int
? ? If max_cols is exceeded, switch to truncate view. Depending on
? ? `large_repr`, objects are either centrally truncated or printed as
? ? a summary view. 'None' value means unlimited.
? ? In case python/IPython is running in a terminal and `large_repr`
? ? equals 'truncate' this can be set to 0 and pandas will auto-detect
? ? the width of the terminal and print a truncated object which fits
? ? the screen width. The IPython notebook, IPython qtconsole, or IDLE
? ? do not run in a terminal and hence it is not possible to do
? ? correct auto-detection.
? ? [default: 20] [currently: 20]
display.max_colwidth : int
? ? The maximum width in characters of a column in the repr of
? ? a pandas data structure. When the column overflows, a "..."
? ? placeholder is embedded in the output.
? ? [default: 50] [currently: 200]
display.max_info_columns : int
? ? max_info_columns is used in DataFrame.info method to decide if
? ? per column information will be printed.
? ? [default: 100] [currently: 100]
display.max_info_rows : int or None
? ? df.info() will usually show null-counts for each column.
? ? For large frames this can be quite slow. max_info_rows and max_info_cols
? ? limit this null check only to frames with smaller dimensions than
? ? specified.
? ? [default: 1690785] [currently: 1690785]
display.max_rows : int
? ? If max_rows is exceeded, switch to truncate view. Depending on
? ? `large_repr`, objects are either centrally truncated or printed as
? ? a summary view. 'None' value means unlimited.
? ? In case python/IPython is running in a terminal and `large_repr`
? ? equals 'truncate' this can be set to 0 and pandas will auto-detect
? ? the height of the terminal and print a truncated object which fits
? ? the screen height. The IPython notebook, IPython qtconsole, or
? ? IDLE do not run in a terminal and hence it is not possible to do
? ? correct auto-detection.
? ? [default: 60] [currently: 60]
display.max_seq_items : int or None
    when pretty-printing a long sequence, no more than `max_seq_items`
    will be printed. If items are omitted, they will be denoted by the
    addition of "..." to the resulting string.
    If set to None, the number of items to be printed is unlimited.
    [default: 100] [currently: 100]
display.memory_usage : bool, string or None
? ? This specifies if the memory usage of a DataFrame should be displayed when
? ? df.info() is called. Valid values True,False,'deep'
? ? [default: True] [currently: True]
display.mpl_style : bool
? ? Setting this to 'default' will modify the rcParams used by matplotlib
? ? to give plots a more pleasing visual style by default.
? ? Setting this to None/False restores the values to their initial value.
? ? [default: None] [currently: None]
display.multi_sparse : boolean
? ? "sparsify" MultiIndex display (don't display repeated
? ? elements in outer levels within groups)
? ? [default: True] [currently: True]
display.notebook_repr_html : boolean
? ? When True, IPython notebook will use html representation for
? ? pandas objects (if it is available).
? ? [default: True] [currently: True]
display.pprint_nest_depth : int
? ? Controls the number of nested levels to process when pretty-printing
? ? [default: 3] [currently: 3]
display.precision : int
? ? Floating point output precision (number of significant digits). This is
? ? only a suggestion
? ? [default: 6] [currently: 6]
display.show_dimensions : boolean or 'truncate'
? ? Whether to print out dimensions at the end of DataFrame repr.
? ? If 'truncate' is specified, only print out the dimensions if the
? ? frame is truncated (e.g. not display all rows and/or columns)
? ? [default: truncate] [currently: truncate]
display.unicode.ambiguous_as_wide : boolean
    Whether to use the Unicode East Asian Width to calculate the display text
    width.
    Enabling this may affect performance (default: False)
    [default: False] [currently: False]
display.unicode.east_asian_width : boolean
    Whether to use the Unicode East Asian Width to calculate the display text
    width.
    Enabling this may affect performance (default: False)
    [default: False] [currently: False]
display.width : int
? ? Width of the display in characters. In case python/IPython is running in
? ? a terminal this can be set to None and pandas will correctly auto-detect
? ? the width.
? ? Note that the IPython notebook, IPython qtconsole, or IDLE do not run in a
? ? terminal and hence it is not possible to correctly detect the width.
? ? [default: 80] [currently: 80]
io.excel.xls.writer : string
? ? The default Excel writer engine for 'xls' files. Available options:
? ? 'xlwt' (the default).
? ? [default: xlwt] [currently: xlwt]
io.excel.xlsm.writer : string
? ? The default Excel writer engine for 'xlsm' files. Available options:
? ? 'openpyxl' (the default).
? ? [default: openpyxl] [currently: openpyxl]
io.excel.xlsx.writer : string
? ? The default Excel writer engine for 'xlsx' files. Available options:
? ? 'xlsxwriter' (the default), 'openpyxl'.
? ? [default: xlsxwriter] [currently: xlsxwriter]
io.hdf.default_format : format
    default writing format; if None, then
    put will default to 'fixed' and append will default to 'table'
    [default: None] [currently: None]
io.hdf.dropna_table : boolean
? ? drop ALL nan rows when appending to a table
? ? [default: False] [currently: False]
mode.chained_assignment : string
? ? Raise an exception, warn, or no action if trying to use chained assignment,
? ? The default is warn
? ? [default: warn] [currently: warn]
mode.sim_interactive : boolean
? ? Whether to simulate interactive mode for purposes of testing
? ? [default: False] [currently: False]
mode.use_inf_as_null : boolean
? ? True means treat None, NaN, INF, -INF as null (old way),
? ? False means None and NaN are null, but INF, -INF are not null
? ? (new way).
? ? [default: False] [currently: False]