Detailed MLDE and Data warehouse features (#608)

Co-authored-by: Damien Burks <[email protected]>
finos · Jan 7, 2025 · 5d63205 · 5d63205
1 parent 2be7b51
commit 5d63205
Show file tree

Hide file tree

Showing 5 changed files with 155 additions and 54 deletions.
diff --git a/services/ai-ml/mlde/features.yaml b/services/ai-ml/mlde/features.yaml
@@ -1,40 +1,40 @@
 common_features:
-  - CCC.F03  # Access/Activity Logs
-  - CCC.F06  # Identity-Based Access Control
-  - CCC.F08  # Multi-zone Deployment
-  - CCC.F09  # Monitoring
-  - CCC.F10  # Logging
+  - CCC.F03 # Access/Activity Logs
+  - CCC.F06 # Identity-Based Access Control
+  - CCC.F08 # Multi-zone Deployment
+  - CCC.F09 # Monitoring
+  - CCC.F10 # Logging
 
 features:
-  - id: CCC.MLDE.F01 #Managed Notebook Environments
+  - id: CCC.MLDE.F01
     title: Managed Notebook Environments
     description: |
       Provides fully managed notebook instances specifically designed
       for machine learning development, eliminating the need to manage
       underlying infrastructure.
 
-  - id: CCC.MLDE.F02 #Pre-configured Machine Learning Libraries
+  - id: CCC.MLDE.F02
     title: Pre-configured Machine Learning Libraries
     description: |
       Offers environments pre-installed with popular machine
       learning libraries and frameworks such as TensorFlow, PyTorch,
       and Scikit-learn, optimized for ML tasks.
 
-  - id: CCC.MLDE.F03 #Integrated Experiment Management
+  - id: CCC.MLDE.F03
     title: Integrated Experiment Management
     description: |
       Facilitates tracking and management of machine learning
       experiments, including parameters, metrics, and artifacts,
       within the development environment.
 
-  - id: CCC.MLDE.F04 #Model Training and Deployment Integration
+  - id: CCC.MLDE.F04
     title: Model Training and Deployment Integration
     description: |
       Supports seamless transition from model development to
       training and deployment, allowing models to be trained and
       deployed directly from the MLDE.
 
-  - id: CCC.MLDE.F05 #Automated Machine Learning (AutoML) Features
+  - id: CCC.MLDE.F05
     title: Automated Machine Learning (AutoML) Features
     description: |
       Offers AutoML functionalities to automatically build,

diff --git a/services/ai-ml/mlde/metadata.yaml b/services/ai-ml/mlde/metadata.yaml
@@ -0,0 +1,19 @@
+title: Machine Learning Development Environment
+id: CCC.MLDE
+description: |
+  Machine Learning Development Environment refers to the suite of tools,
+  infrastructure, and processes that facilitate the development, testing,
+  deployment, and maintenance of machine learning models.
+release_details:
+  - version: "2025.06"
+    assurance_level: None
+    threat_model_url: None
+    threat_model_author: None
+    red_team: None
+    red_team_exercise_url: None
+    release_manager:
+      name: Damien Burks
+      github_id: damienjburks
+      company: Citi
+      summary: None
+    change_log: []
diff --git a/services/ai-ml/service-families.yaml b/services/ai-ml/service-families.yaml
@@ -41,3 +41,16 @@ service_categories:
           - Github Copilot for Azure
       - Google Cloud:
           - Google Duet AI
+  - id: CCC.MLDE
+    title: Machine Learning Development Environment
+    description: |
+      Services designed to facilitate the development, testing, deployment and
+      maintenance of machine learning models, to improve efficiency throughout
+      the machine learning lifecycle.
+    examples:
+      - AWS:
+          - Amazon SageMaker
+      - Azure:
+          - Azure Machine Learning
+      - Google Cloud:
+          - Google Vertex AI
diff --git a/services/database/warehouse/features.yaml b/services/database/warehouse/features.yaml
@@ -1,61 +1,111 @@
 common_features:
-  - CCC.F01  # Encryption in Transit Enabled by Default
-  - CCC.F02  # Encryption at Rest Enabled by Default
-  - CCC.F03  # Access/Activity Logs
-  - CCC.F04  # Transaction Rate Limits
-  - CCC.F06  # Identity-Based Access Control
-  - CCC.F07  # Event Notifications
-  - CCC.F08  # Multi-zone Deployment
-  - CCC.F09  # Monitoring
-  - CCC.F11  # Backup
-  - CCC.F12  # Restore
-  - CCC.F14  # API Access
-  - CCC.F21  # Replication
+  - CCC.F01 # Encryption in Transit Enabled by Default
+  - CCC.F02 # Encryption at Rest Enabled by Default
+  - CCC.F03 # Access/Activity Logs
+  - CCC.F04 # Transaction Rate Limits
+  - CCC.F06 # Identity-Based Access Control
+  - CCC.F07 # Event Notifications
+  - CCC.F08 # Multi-zone Deployment
+  - CCC.F11 # Backup
+  - CCC.F12 # Restore
+  - CCC.F14 # API Access
+  - CCC.F19 # On-demand Scaling
+  - CCC.F21 # Replication
 
 features:
-  - id: CCC.DataWar.F01  # Dataset Management
-    title: Dataset Management
+  - id: CCC.DataWar.F01
+    title: Centralized Data Repository
     description: |
-      Provides the ability to create, manage, and organize
-      datasets within the Cloud Data Warehouse for structured
-      data storage.
-    # references:  # Temporary for validation purposes
-    #   AWS: https://docs.aws.amazon.com/redshift/latest/dg/c-getting-started-using-spectrum.html
-    #   GCP: https://cloud.google.com/bigquery/docs/datasets
-    #   Azure: https://learn.microsoft.com/en-us/azure/synapse-analytics/synapse-link/connect-synapse-link-sql-database
+      Acts as a centralized repository where data from various
+      sources is consolidated, making it easier to manage and
+      analyze large volumes of data.
 
+  - id: CCC.DataWar.F02
+    title: Optimized Query Performance
+    description: |
+      Handles complex queries on large datasets efficiently using
+      techniques such as indexing and partitioning.
 
-  - id: CCC.DataWar.F02  # Managed Views
-    title: Managed Views
+  - id: CCC.DataWar.F03
+    title: Scalability
     description: |
-      Supports creating views that allow sharing query results
-      with specific users or groups without providing access to
-      underlying data.
-    # references:
-    #   AWS: https://docs.aws.amazon.com/redshift/latest/dg/r_CREATE_VIEW.html
-    #   GCP: https://cloud.google.com/bigquery/docs/authorized-views
-    #   Azure: https://learn.microsoft.com/en-us/azure/synapse-analytics/sql/create-use-views
+      Ability to scale with growing data volumes and handle multiple
+      queries simultaneously without compromising the performance.
 
+  - id: CCC.DataWar.F04
+    title: Column Storage
+    description: |
+      Stores data in columns rather than rows for efficient
+      data retrieval.
 
-  - id: CCC.DataWar.F03  # Column-Level Security
-    title: Column-Level Security
+  - id: CCC.DataWar.F05
+    title: SQL Based Querying
     description: |
-      Allows setting access policies at the column level to restrict access to sensitive data fields within tables.
-# references:
-#   AWS: https://aws.amazon.com/blogs/big-data/achieve-finer-grained-data-
-#      security-with-column-level-access-control-in-amazon-redshift/
-#   GCP: https://cloud.google.com/bigquery/docs/column-level-security
-#   Azure: https://learn.microsoft.com/en-us/azure/synapse-analytics/sql-data-warehouse/column-level-security
+      Supports SQL based querying on the data sets with specific
+      enhancements and optimization for data warehousing.
 
+  - id: CCC.DataWar.F06
+    title: Data Types
+    description: |
+      Ability to store processed structured and semi-structured
+      data optimized for querying and analysis.
+
+  - id: CCC.DataWar.F07
+    title: Massively Parallel Processing (MPP)
+    description: |
+      Distributes queries across multiple nodes for increased performance.
+
+  - id: CCC.DataWar.F08
+    title: Materialized Views
+    description: |
+      Ability to store results of a query into physical tables for faster
+      data retrieval and improved query performance for complex queries.
+
+  - id: CCC.DataWar.F09
+    title: Column-Level Security
+    description: |
+      Allows setting access policies at the column level to
+      restrict access to sensitive data fields within tables.
 
-  - id: CCC.DataWar.F05  # Row-Level Security
+  - id: CCC.DataWar.F10
     title: Row-Level Security
     description: |
       Enables setting access policies at the row level to
       control access to subsets of data within a table based
       on user roles.
-# references:
-#   AWS: https://docs.aws.amazon.com/redshift/latest/dg/t_rls.html
-#   GCP: https://cloud.google.com/bigquery/docs/row-level-security-intro
-#   Azure: https://techcommunity.microsoft.com/t5/azure-synapse-analytics-
-#     blog/how-to-implement-row-level-security-in-serverless-sql-pools/ba-p/2354759
+
+  - id: CCC.DataWar.F11
+    title: Integration with Data Sources
+    description: |
+      Seamless integration with various data sources such as object
+      storage, relational and non-relational databases, data streams
+      and data lakes.
+
+  - id: CCC.DataWar.F12
+    title: Integration with ETL
+    description: |
+      Integration with services that extract, transform and load (ETL)
+      data from various sources into the data warehouse. Unstructured
+      data is transformed to structured or semi-structured data before
+      ingestion to the data warehouse using ETL tools.
+
+  - id: CCC.DataWar.F13
+    title: Integration with ML
+    description: |
+      Built-in integration with machine learning services for enhanced
+      processing of large volumes of complex data with ML models for
+      predictive analytics, automated insights and more. ML can be used
+      in data cleansing and transformation for improved data quality as well.
+
+  - id: CCC.DataWar.F14
+    title: Real-time Monitoring
+    description: |
+      Ability to continuously track and analyze data as it is ingested,
+      processed and stored to ensure data quality, operational efficiency,
+      scalability and security.
+
+  - id: CCC.DataWar.F15
+    title: Cross-Region Replication
+    description: |
+      Ability to replicate data to multiple regions for high availability,
+      disaster recovery and low-latency access.
diff --git a/services/database/warehouse/metadata.yaml b/services/database/warehouse/metadata.yaml
@@ -0,0 +1,19 @@
+title: Data Warehouse
+id: CCC.DataWar
+description: |
+  A data warehouse is a centralized repository designed to
+  store, manage, and analyze large volumes of data from
+  various sources.
+release_details:
+  - version: "2025.06"
+    assurance_level: None
+    threat_model_url: None
+    threat_model_author: None
+    red_team: None
+    red_team_exercise_url: None
+    release_manager:
+      name: Damien Burks
+      github_id: damienjburks
+      company: Citi
+      summary: None
+    change_log: []