xxxxxxxxxx
Creating gs://ieee-ompi-datasets/...
xxxxxxxxxx
%%bash
# transfer data from Github to the bucket.
# Stream the workshop CSV straight from GitHub into the GCS bucket:
# 'gsutil cp -' reads the object contents from stdin, so the ~47 MB file
# never touches local disk.
curl https://raw.githubusercontent.com/dvdbisong/IEEE-Carleton-and-OMPI-Machine-Learning-Workshop/master/data/crypto-markets/crypto-markets.csv | gsutil cp - gs://ieee-ompi-datasets/crypto-markets.csv
xxxxxxxxxx
% Total % Received % Xferd Average Speed Time Time Time Current
Dload Upload Total Spent Left Speed
0 0 0 0 0 0 0 0 --:--:-- --:--:-- --:--:-- 0Copying from <STDIN>
100 47.0M 100 47.0M 0 0 25.6M 0 0:00:01 0:00:01 --:--:-- 25.6M
/ [1 files][ 0.0 B/ 0.0 B]
Operation completed over 1 objects.
xxxxxxxxxx
# install the apache beam library and other important setup packages.
# restart the session after installing apache beam.
xxxxxxxxxx
%%bash
# Activate the Datalab Python 2 conda environment before installing.
source activate py2env
# NOTE(review): installing then immediately uninstalling google-cloud-dataflow
# presumably pulls in its transitive dependencies while removing the bundled
# (older) SDK so that apache-beam[gcp] below supplies the Beam SDK instead —
# confirm against the Datalab setup docs.
pip install google-cloud-dataflow
pip uninstall -y google-cloud-dataflow
# Pin pytz to a version known to be compatible with this Beam release.
conda install -y pytz==2018.4
pip install apache-beam[gcp]
xxxxxxxxxx
# import relevant libraries
import apache_beam as beam
from apache_beam.io import ReadFromText
from apache_beam.io import WriteToText
xxxxxxxxxx
# transformation code
def run(project, source_bucket, target_bucket):
    """Build and launch a Dataflow job that filters bitcoin rows.

    Reads crypto-markets.csv from ``source_bucket``, keeps the slug, date,
    open, high, low and close columns for rows whose slug is 'bitcoin',
    and writes the result as two CSV shards under
    gs://<target_bucket>/transformed-crypto-bitcoin.

    Args:
        project: GCP project id that will run the Dataflow job.
        source_bucket: GCS bucket name holding crypto-markets.csv.
        target_bucket: GCS bucket name that receives the output shards.
    """
    # Imported inside the function so the module is available to the
    # pickled lambdas when no_save_main_session is set.
    import csv

    options = {
        'staging_location': 'gs://ieee-ompi-datasets/staging',
        'temp_location': 'gs://ieee-ompi-datasets/temp',
        'job_name': 'dataflow-crypto',
        'project': project,
        'max_num_workers': 24,
        'teardown_policy': 'TEARDOWN_ALWAYS',
        'no_save_main_session': True,
        'runner': 'DataflowRunner'
    }
    options = beam.pipeline.PipelineOptions(flags=[], **options)
    crypto_dataset = 'gs://{}/crypto-markets.csv'.format(source_bucket)
    processed_ds = 'gs://{}/transformed-crypto-bitcoin'.format(target_bucket)
    pipeline = beam.Pipeline(options=options)

    # Source CSV column indices: 0:slug, 3:date, 5:open, 6:high, 7:low, 8:close
    rows = (
        pipeline |
        'Read from bucket' >> ReadFromText(crypto_dataset) |
        'Tokenize as csv columns' >> beam.Map(lambda line: next(csv.reader([line]))) |
        'Select columns' >> beam.Map(lambda fields: (fields[0], fields[3], fields[5],
                                                     fields[6], fields[7], fields[8])) |
        'Filter bitcoin rows' >> beam.Filter(lambda row: row[0] == 'bitcoin')
    )
    combined = (
        rows |
        # FIX: the original lambda used Python 2 tuple-parameter unpacking
        # (removed by PEP 3113 — SyntaxError on Python 3) and shadowed the
        # builtin 'open'. '{},...'.format(*row) is behaviorally equivalent.
        'Write to bucket' >> beam.Map(lambda row: '{},{},{},{},{},{}'.format(*row)) |
        WriteToText(
            file_path_prefix=processed_ds,
            file_name_suffix=".csv", num_shards=2,
            shard_name_template="-SS-of-NN",
            header='slug, date, open, high, low, close')
    )
    pipeline.run()
xxxxxxxxxx
# execute transformation
if __name__ == '__main__':
    # FIX: the original used a Python 2 print statement, which is a
    # SyntaxError on Python 3; use the print() function instead.
    print('Run pipeline on the cloud')
    run(project='oceanic-sky-230504', source_bucket='ieee-ompi-datasets', target_bucket='ieee-ompi-datasets')