使用ccxt爬取交易所K线数据
目标: 使用ccxt分页爬取Bybit交易所的1分钟K线行情数据并保存为CSV文件, 然后对数据进行合并、去重, 并将时间转换为北京时间.
import pandas as pd
import time
import os
import datetime
import ccxt
# Print wide DataFrames on a single line instead of wrapping the columns.
pd.options.display.expand_frame_repr = False
def crawl_bybit_datas(symbol, start_time, end_time):
    """Download 1-minute OHLCV candles from Bybit and save them as CSV chunks.

    Candles are requested page by page through ccxt, starting at
    ``start_time``.  Every page is written to
    ``<cwd>/<symbol without '/'>/<open_time of its last candle>.csv`` with
    ``open_time`` as the index column.  Paging stops as soon as the newest
    candle received is later than ``end_time`` or the exchange returns no
    candles at all.

    Failed requests are printed and retried after 10 seconds; errors raised
    while processing or saving a page are not swallowed and propagate.

    :param symbol: ccxt market symbol, e.g. ``'BTC/USDT'`` or ``'ETH/USD'``.
    :param start_time: first day to download, formatted ``'%Y-%m-%d'``
        (e.g. ``'2018-1-1'``).
    :param end_time: day at which to stop, same format (e.g. ``'2019-1-1'``).
    :return: None
    :raises ValueError: if a date string does not match ``'%Y-%m-%d'``.
    """
    print(ccxt.__version__)  # the original author ran this with 1.18.1213 / 1.26.50
    exchange = ccxt.bybit()
    print(exchange)

    # One folder per symbol, created next to the current working directory.
    file_dir = os.path.join(os.getcwd(), symbol.replace('/', ''))
    print(file_dir)
    os.makedirs(file_dir, exist_ok=True)

    # NOTE: time.mktime interprets the parsed (naive) dates in the machine's
    # local time zone; the results are converted to milliseconds for ccxt.
    start_dt = datetime.datetime.strptime(start_time, '%Y-%m-%d')
    end_dt = datetime.datetime.strptime(end_time, '%Y-%m-%d')
    start_time_stamp = int(time.mktime(start_dt.timetuple())) * 1000
    end_time_stamp = int(time.mktime(end_dt.timetuple())) * 1000

    limit_count = 200  # Bybit returns at most 200 candles per request.
    timeframe_ms = 60 * 1000  # length of one '1m' candle in milliseconds

    while True:
        print(start_time_stamp)
        try:
            data = exchange.fetch_ohlcv(symbol, timeframe='1m', since=start_time_stamp, limit=limit_count)
        except Exception as error:
            # Best-effort download: report the failure and retry the same page.
            print(error)
            time.sleep(10)
            continue

        if not data:
            # Nothing newer is available; retrying would loop forever.
            print("完成数据的请求.")
            break

        df = pd.DataFrame(data)
        df.rename(columns={0: 'open_time', 1: 'open', 2: 'high', 3: 'low', 4: 'close', 5: 'volume'}, inplace=True)
        last_open_time = int(df.iloc[-1]['open_time'])

        # Each chunk file is named after the open time of its last candle.
        filename = str(last_open_time) + '.csv'
        save_file_path = os.path.join(file_dir, filename)
        print("文件保存路径为:%s" % save_file_path)
        df.set_index('open_time', drop=True, inplace=True)
        df.to_csv(save_file_path)

        if last_open_time > end_time_stamp:
            print("完成数据的请求.")
            break

        # Continue with the candle after the last one received so the next
        # page does not start with a duplicate and always makes progress.
        start_time_stamp = last_open_time + timeframe_ms
        time.sleep(0.2)  # short pause between pages (presumably for the API rate limit)
def sample_datas(symbol):
    """Load every downloaded CSV chunk of ``symbol`` into one DataFrame.

    The chunks are looked up recursively under
    ``<cwd>/<symbol without '/'>``, concatenated and sorted by ``open_time``
    in ascending order.  Duplicate rows are kept.

    :param symbol: market symbol such as ``'BTC/USD'``; the ``'/'`` is removed
        to build the folder name.
    :return: DataFrame with all rows of all chunk files; an empty DataFrame
        with the columns ``open_time, open, high, low, close, volume`` when
        no CSV file is found.
    """
    path = os.path.join(os.getcwd(), symbol.replace('/', ''))
    print(path)

    # Join with ``root`` (not ``path``) so files in sub-folders resolve correctly.
    file_paths = sorted(
        os.path.join(root, name)
        for root, _dirs, names in os.walk(path)
        for name in names
        if name.endswith('.csv')
    )

    if file_paths:
        # A single concat replaces the removed (and quadratic) DataFrame.append.
        all_df = pd.concat([pd.read_csv(file_path) for file_path in file_paths], ignore_index=True)
    else:
        all_df = pd.DataFrame(columns=['open_time', 'open', 'high', 'low', 'close', 'volume'])

    all_df = all_df.sort_values(by='open_time', ascending=True)
    print(all_df)
    return all_df
# for index, item in all_df.iterrows():
# try:
# dt = (pd.to_datetime(item['open_time'], unit='ms'))
# print(dt)
# dt = datetime.datetime.strptime(str(dt), '%Y-%m-%d %H:%M:%S') # 2018-01-01 17:36:00:42
# print(dt)
# except:
# dt = (pd.to_datetime(item['open_time'], unit='ms'))
# print(dt)
def clear_datas(symbol):
    """Merge the downloaded chunks of ``symbol`` into one cleaned CSV file.

    Loads all chunks via ``sample_datas``, floors ``open_time`` (milliseconds)
    to the whole minute, adds a ``Datetime`` column shifted by +8 hours
    (UTC to Beijing time) formatted as ``YYYY-MM-DD HH:MM:SS``, drops rows
    with a duplicate ``open_time`` and writes the result, indexed by
    ``Datetime``, to ``<symbol without '/'>_1min_data.csv`` in the current
    working directory.

    :param symbol: market symbol such as ``'BTC/USD'``.
    :return: None
    """
    df = sample_datas(symbol)

    # open_time is in milliseconds (it is parsed with unit='ms' below), so a
    # whole minute is 60 000 ms -- flooring to 60 would not align anything.
    minute_ms = 60 * 1000
    df['open_time'] = df['open_time'] // minute_ms * minute_ms
    print(df)

    # Convert the UTC timestamps to Beijing time (UTC+8) and render as text.
    beijing_time = pd.to_datetime(df['open_time'], unit='ms') + pd.Timedelta(hours=8)
    df['Datetime'] = beijing_time.dt.strftime('%Y-%m-%d %H:%M:%S')

    df.drop_duplicates(subset=['open_time'], inplace=True)
    df.set_index('Datetime', inplace=True)
    print("*" * 20)
    print(df)

    symbol_path = symbol.replace('/', '')
    df.to_csv(f'{symbol_path}_1min_data.csv')
if __name__ == "__main__":
    # Step 1 (disabled): download the raw candles first.
    # crawl_bybit_datas('BTC/USD', '2018-11-15', '2020-4-18')  # ccxt symbol, e.g. BTC/USDT
    # Step 2: merge the downloaded chunks into one cleaned CSV file.
    target_symbol = 'BTC/USD'
    clear_datas(target_symbol)
参考 https://github.com/ramoslin02/51bitquant/blob/master/bybit/crawl_kline_data.py