Skip to content

Commit

Permalink
data duplication mitigated
Browse files Browse the repository at this point in the history
  • Loading branch information
j-beastman committed Jun 18, 2024
1 parent 4ce1bba commit b6df269
Showing 1 changed file with 8 additions and 6 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -72,9 +72,11 @@ def create_or_update_modeling_dataset(modeling_dataset_name: str, #TODO: change
staging_data['likeDiff'] = staging_data.groupby('video_id')['likeCount'].diff()
staging_data['commentDiff'] = staging_data.groupby('video_id')['commentCount'].diff()

staging_data = staging_data.sort_values(by=["viewCount", "video_id"])
staging_data = staging_data.drop_duplicates(subset=["video_id", "viewCount", "as_of_datetime"])

staging_data = staging_data.drop_duplicates()
staging_data = staging_data.groupby('video_id').apply(lambda group: group.iloc[::3]).reset_index(drop=True)

staging_data.fillna(0, inplace=True)

dataset = dr.Dataset.create_version_from_in_memory_data(modeling_dataset_id, staging_data)

Expand All @@ -92,17 +94,17 @@ def remove_old_retraining_data(endpoint: str,
for dataset_name in datasets_to_check:
data_id = _check_if_dataset_exists(dataset_name)

url = f"{endpoint}/datasets/{data_id: str}/versions/"
url = f"{endpoint}/datasets/{data_id}/versions/"
dataset_versions = client.get(url).json()

logger.info(f"Found {dataset_versions['count']} versions of {data_id: str}")
logger.info(f"Found {dataset_versions['count']} versions of {data_id}")

if dataset_versions['count'] > 75:
sorted_versions = sorted(dataset_versions['data'], key=lambda x: pd.to_datetime(x['creationDate']))
for version in sorted_versions[:-50]:
url = f"{endpoint}/datasets/{data_id: str}/versions/{version['versionId']}"
url = f"{endpoint}/datasets/{data_id}/versions/{version['versionId']}"
client.delete(url)
logger.info(f"Deleted {dataset_versions['count'] - 50} versions of {data_id: str}")
logger.info(f"Deleted {dataset_versions['count'] - 50} versions of {data_id}")



0 comments on commit b6df269

Please sign in to comment.