diff --git a/samples/DynamoDBImportCSV/CSVtoDynamoDB.json b/samples/DynamoDBImportCSV/CSVtoDynamoDB.json
index 1f7ffb6..7d195d3 100644
--- a/samples/DynamoDBImportCSV/CSVtoDynamoDB.json
+++ b/samples/DynamoDBImportCSV/CSVtoDynamoDB.json
@@ -2,29 +2,13 @@
   "objects": [
     {
       "myComment" : "Activity used to run the hive script to import CSV data",
-      "output": {
-        "ref": "DataNodeId_cnlSW"
-      },
-      "input": {
-        "ref": "DataNodeId_1ERqq"
-      },
-      "name": "TableRestoreActivity",
+      "name": "TableImportActivity",
       "hiveScript": "DROP TABLE IF EXISTS tempHiveTable;\n\nDROP TABLE IF EXISTS s3TempTable;\n\nCREATE EXTERNAL TABLE tempHiveTable (#{myDDBColDefn})\nSTORED BY 'org.apache.hadoop.hive.dynamodb.DynamoDBStorageHandler' \nTBLPROPERTIES (\"dynamodb.table.name\" = \"#{myDDBTableName}\", \"dynamodb.column.mapping\" = \"#{myDDBTableColMapping}\");\n \nCREATE EXTERNAL TABLE s3TempTable (#{myS3ColMapping})\nROW FORMAT DELIMITED FIELDS TERMINATED BY ',' LINES TERMINATED BY '\\\n' LOCATION '#{myInputS3Loc}';\n \nINSERT OVERWRITE TABLE tempHiveTable SELECT * FROM s3TempTable;",
-      "id": "TableRestoreActivity",
-      "runsOn": { "ref" : "EmrClusterForRestore" },
+      "id": "TableImportActivity",
+      "runsOn": { "ref" : "EmrClusterForImport" },
       "stage": "false",
       "type": "HiveActivity"
     },
-    {
-      "myComment" : "The DynamoDB table from which we need to import data from",
-      "dataFormat": {
-        "ref": "DDBExportFormat"
-      },
-      "name": "DynamoDB",
-      "id": "DataNodeId_1ERqq",
-      "type": "DynamoDBDataNode",
-      "tableName": "#{myDDBTableName}"
-    },
     {
       "failureAndRerunMode": "CASCADE",
       "resourceRole": "DataPipelineDefaultResourceRole",
@@ -35,38 +19,14 @@
       "id": "Default"
     },
     {
-      "name": "EmrClusterForRestore",
+      "name": "EmrClusterForImport",
       "coreInstanceType": "m1.medium",
       "coreInstanceCount": "1",
       "masterInstanceType": "m1.medium",
       "releaseLabel": "emr-4.4.0",
-      "id": "EmrClusterForRestore",
+      "id": "EmrClusterForImport",
       "type": "EmrCluster",
       "terminateAfter": "2 Hours"
-    },
-    {
-      "myComment" : "The S3 path from which we import data from",
-      "directoryPath": "#{myInputS3Loc}",
-      "dataFormat": {
-        "ref": "DataFormatId_xqWRk"
-      },
-      "name": "S3DataNode",
-      "id": "DataNodeId_cnlSW",
-      "type": "S3DataNode"
-    },
-    {
-      "myComment" : "Format for the S3 Path",
-      "name": "DefaultDataFormat1",
-      "column": "not_used STRING",
-      "id": "DataFormatId_xqWRk",
-      "type": "CSV"
-    },
-    {
-      "myComment" : "Format for the DynamoDB table",
-      "name": "DDBExportFormat",
-      "id": "DDBExportFormat",
-      "column": "not_used STRING",
-      "type": "DynamoDBExportDataFormat"
     }
   ],
   "parameters": [
diff --git a/samples/DynamoDBImportCSV/README.md b/samples/DynamoDBImportCSV/README.md
index 316f89d..96ac8b2 100644
--- a/samples/DynamoDBImportCSV/README.md
+++ b/samples/DynamoDBImportCSV/README.md
@@ -1,18 +1,18 @@
-#DynamoDB to CSV import
+# DynamoDB import from CSV

-##About the sample
-The pipeline definition is used to import DynamoDB data to a CSV format.
-
-##Running the pipeline
+## About the sample
+
+The pipeline definition is used to import CSV data from S3 into a DynamoDB table.
+
+## Running the pipeline

 Example DynamoDB table with keys: id

 User needs to provide:

-1. Input S3 folder: The s3 folder prefix from which the CSV data is to be imported.
-2. DynamoDB read throughput ratio: The throughput to be used for the import operation.
-3. DynamoDB table name: The table name from which we need to import the data.
-4. S3 Column Mappings: A comma seperated column definitions. For example, customer_id string, income string, demographics string, financial string
-4. Dynamodb Column Mappings: A comma seperated column definitions. For example, customer_id string, income string, demographics string, financial string
-5. S3 to DynamoDB Column Mapping: A comma separated mapping of S3 to DynamoDB for e.g. customer_id:customer_id,income:income,demographics:demographics,financial:financial. Please take care of not using spaces in between the commas.
+1. Input S3 folder: The source S3 folder prefix from which the CSV data is to be imported.
+2. DynamoDB table name: The name of the target DynamoDB table.
+3. S3 Column Mappings: Comma-separated column definitions. For example: customer_id string, income string, demographics string, financial string.
+4. DynamoDB Column Mappings: Comma-separated column definitions. For example: customer_id string, income string, demographics string, financial string.
+5. S3 to DynamoDB Column Mapping: A comma-separated mapping of S3 columns to DynamoDB attributes, e.g. customer_id:customer_id,income:income,demographics:demographics,financial:financial. Do not include spaces between the comma-separated values.
 6. Log Uri: S3 log path to capture the pipeline logs.
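Side note on README items 3-5 (not part of this change): the two column-definition strings and the S3-to-DynamoDB mapping must stay consistent with each other, and the mapping string may not contain spaces because it is substituted verbatim into the Hive script's `dynamodb.column.mapping` table property. A minimal Python sketch, assuming the example customer columns from the README, that builds all three parameter values from one list of column names:

```python
# Illustrative only: builds the comma-separated parameter strings described
# in the README (items 3-5) from a single list of CSV column names, so the
# S3 definitions, DynamoDB definitions, and the column mapping stay in sync.
columns = ["customer_id", "income", "demographics", "financial"]

# Items 3 and 4: "name type" pairs, comma separated.
s3_column_defs = ", ".join(f"{name} string" for name in columns)
ddb_column_defs = ", ".join(f"{name} string" for name in columns)

# Item 5: "s3col:ddbcol" pairs, comma separated, with no spaces.
column_mapping = ",".join(f"{name}:{name}" for name in columns)

print(s3_column_defs)   # customer_id string, income string, demographics string, financial string
print(ddb_column_defs)  # same shape as the S3 definitions in this sample
print(column_mapping)   # customer_id:customer_id,income:income,demographics:demographics,financial:financial
```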