DhanushMahesh committed on
Commit
3531f4a
·
1 Parent(s): de245a1

feat: update execution counts and enhance data processing in main notebook

Browse files
Files changed (2) hide show
  1. data/Kollywood 2020-2022 songs.csv +0 -0
  2. main.ipynb +10 -4
data/Kollywood 2020-2022 songs.csv ADDED
The diff for this file is too large to render. See raw diff
 
main.ipynb CHANGED
@@ -2,7 +2,7 @@
2
  "cells": [
3
  {
4
  "cell_type": "code",
5
- "execution_count": 7,
6
  "metadata": {},
7
  "outputs": [],
8
  "source": [
@@ -36,7 +36,7 @@
36
  },
37
  {
38
  "cell_type": "code",
39
- "execution_count": 9,
40
  "metadata": {},
41
  "outputs": [],
42
  "source": [
@@ -117,7 +117,7 @@
117
  },
118
  {
119
  "cell_type": "code",
120
- "execution_count": 12,
121
  "metadata": {},
122
  "outputs": [
123
  {
@@ -136,7 +136,13 @@
136
  "\n",
137
  "df = pd.concat([df1, df2, df3])\n",
138
  "logger.info(f\"Concatenated DataFrame shape: {df.shape}\")\n",
139
- "logger.info(f\"Unique Track URIs: {df['Track URI'].unique().shape}\")"
 
 
 
 
 
 
140
  ]
141
  },
142
  {
 
2
  "cells": [
3
  {
4
  "cell_type": "code",
5
+ "execution_count": 2,
6
  "metadata": {},
7
  "outputs": [],
8
  "source": [
 
36
  },
37
  {
38
  "cell_type": "code",
39
+ "execution_count": 3,
40
  "metadata": {},
41
  "outputs": [],
42
  "source": [
 
117
  },
118
  {
119
  "cell_type": "code",
120
+ "execution_count": null,
121
  "metadata": {},
122
  "outputs": [
123
  {
 
136
  "\n",
137
  "df = pd.concat([df1, df2, df3])\n",
138
  "logger.info(f\"Concatenated DataFrame shape: {df.shape}\")\n",
139
+ "logger.info(f\"Unique Track URIs: {df['Track URI'].unique().shape}\")\n",
140
+ "\n",
141
+ "logger.info(f\"Before dropping duplicates: {df.shape}\")\n",
142
+ "df = df.drop_duplicates(subset=['Track URI'])\n",
143
+ "logger.info(f\"Dropped duplicates DataFrame shape: {df.shape}\")\n",
144
+ "\n",
145
+ "df.to_csv(\"data/Kollywood 2020-2022 songs.csv\", index=False)"
146
  ]
147
  },
148
  {