Spaces: Build error
Upload datasets.ipynb
datasets.ipynb  CHANGED  (+582 -94)
|
@@ -9,18 +9,9 @@
|
|
| 9 |
},
|
| 10 |
{
|
| 11 |
"cell_type": "code",
|
| 12 |
-
"execution_count":
|
| 13 |
"metadata": {},
|
| 14 |
-
"outputs": [
|
| 15 |
-
{
|
| 16 |
-
"name": "stderr",
|
| 17 |
-
"output_type": "stream",
|
| 18 |
-
"text": [
|
| 19 |
-
"c:\\gitProjects\\yeni\\.venv\\lib\\site-packages\\tqdm\\auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
|
| 20 |
-
" from .autonotebook import tqdm as notebook_tqdm\n"
|
| 21 |
-
]
|
| 22 |
-
}
|
| 23 |
-
],
|
| 24 |
"source": [
|
| 25 |
"from datasets import load_dataset\n",
|
| 26 |
"import pandas as pd \n",
|
|
@@ -39,7 +30,7 @@
|
|
| 39 |
},
|
| 40 |
{
|
| 41 |
"cell_type": "code",
|
| 42 |
-
"execution_count":
|
| 43 |
"metadata": {},
|
| 44 |
"outputs": [],
|
| 45 |
"source": [
|
|
@@ -50,7 +41,7 @@
|
|
| 50 |
},
|
| 51 |
{
|
| 52 |
"cell_type": "code",
|
| 53 |
-
"execution_count":
|
| 54 |
"metadata": {},
|
| 55 |
"outputs": [],
|
| 56 |
"source": [
|
|
@@ -60,7 +51,7 @@
|
|
| 60 |
},
|
| 61 |
{
|
| 62 |
"cell_type": "code",
|
| 63 |
-
"execution_count":
|
| 64 |
"metadata": {},
|
| 65 |
"outputs": [],
|
| 66 |
"source": [
|
|
@@ -71,7 +62,7 @@
|
|
| 71 |
},
|
| 72 |
{
|
| 73 |
"cell_type": "code",
|
| 74 |
-
"execution_count":
|
| 75 |
"metadata": {},
|
| 76 |
"outputs": [],
|
| 77 |
"source": [
|
|
@@ -101,7 +92,7 @@
|
|
| 101 |
},
|
| 102 |
{
|
| 103 |
"cell_type": "code",
|
| 104 |
-
"execution_count":
|
| 105 |
"metadata": {},
|
| 106 |
"outputs": [
|
| 107 |
{
|
|
@@ -151,7 +142,7 @@
|
|
| 151 |
},
|
| 152 |
{
|
| 153 |
"cell_type": "code",
|
| 154 |
-
"execution_count":
|
| 155 |
"metadata": {},
|
| 156 |
"outputs": [
|
| 157 |
{
|
|
@@ -206,12 +197,96 @@
|
|
| 206 |
" return train_collection,test_collection\n",
|
| 207 |
"\n",
|
| 208 |
"# Train ve test datasetlerini MongoDB'ye yüklemek için fonksiyonu çağır\n",
|
| 209 |
-
"train_file_path = 'C:\\\\gitProjects\\\\
|
| 210 |
-
"test_file_path = 'C:\\\\gitProjects\\\\
|
| 211 |
"\n",
|
| 212 |
"train_collection, test_collection = dataset_read(train_file_path, test_file_path)"
|
| 213 |
]
|
| 214 |
},
|
| 215 |
{
|
| 216 |
"cell_type": "markdown",
|
| 217 |
"metadata": {},
|
|
@@ -225,80 +300,466 @@
|
|
| 225 |
"metadata": {},
|
| 226 |
"outputs": [],
|
| 227 |
"source": [
|
| 228 |
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
|
| 229 |
-
"from
|
| 230 |
"\n",
|
| 231 |
-
"
|
| 232 |
-
"
|
| 233 |
"\n",
|
| 234 |
-
"#text dosyasını koleksiyon üzerinden çekme \n",
|
| 235 |
-
"# Database sınıfı: Veritabanı bağlantıları ve verileri çekme işlevleri\n",
|
| 236 |
-
"# Database sınıfı: Veritabanı bağlantıları ve verileri çekme işlevleri\n",
|
| 237 |
"class Database:\n",
|
| 238 |
" @staticmethod\n",
|
| 239 |
" def get_mongodb():\n",
|
| 240 |
-
"
|
| 241 |
-
" return 'mongodb://localhost:27017/', 'EgitimDatabase', 'train'\n",
|
| 242 |
"\n",
|
|
|
|
| 243 |
" @staticmethod\n",
|
| 244 |
-
" def
|
| 245 |
" mongo_url, db_name, collection_name = Database.get_mongodb()\n",
|
| 246 |
" client = MongoClient(mongo_url)\n",
|
| 247 |
" db = client[db_name]\n",
|
| 248 |
" collection = db[collection_name]\n",
|
| 249 |
-
"
|
| 250 |
-
"
|
| 251 |
-
"
|
| 252 |
-
"
|
| 253 |
-
" title_count = len(title_from_db)\n",
|
| 254 |
-
" return title_from_db, title_count\n",
|
| 255 |
" \n",
|
|
|
|
| 256 |
" @staticmethod\n",
|
| 257 |
-
" def
|
| 258 |
-
"
|
| 259 |
-
"
|
| 260 |
-
"
|
| 261 |
-
"
|
| 262 |
-
"
|
| 263 |
-
"
|
| 264 |
-
"
|
| 265 |
-
"
|
| 266 |
-
"
|
| 267 |
"\n",
|
| 268 |
"\n",
|
| 269 |
-
"#
|
| 270 |
-
"
|
| 271 |
-
"
|
|
|
|
| 272 |
"\n",
|
| 273 |
-
"#
|
| 274 |
-
"
|
| 275 |
-
"
|
| 276 |
-
"
|
| 277 |
-
"
|
| 278 |
-
"
|
| 279 |
]
|
| 280 |
},
|
| 281 |
{
|
| 282 |
-
"cell_type": "
|
|
|
|
| 283 |
"metadata": {},
|
| 284 |
"source": [
|
| 285 |
-
"
|
| 286 |
]
|
| 287 |
},
|
| 288 |
{
|
| 289 |
"cell_type": "code",
|
| 290 |
-
"execution_count":
|
| 291 |
"metadata": {},
|
| 292 |
"outputs": [
|
| 293 |
{
|
| 294 |
-
"
|
| 295 |
-
|
| 296 |
-
|
| 297 |
-
|
| 298 |
-
|
| 299 |
-
|
| 300 |
-
"
|
| 301 |
-
|
| 302 |
}
|
| 303 |
],
|
| 304 |
"source": [
|
|
@@ -315,6 +776,8 @@
|
|
| 315 |
" def get_mongodb():\n",
|
| 316 |
" return 'mongodb://localhost:27017/', 'EgitimDatabase', 'train'\n",
|
| 317 |
"\n",
|
| 318 |
" @staticmethod\n",
|
| 319 |
" def get_input_documents(limit=3):\n",
|
| 320 |
" mongo_url, db_name, collection_name = Database.get_mongodb()\n",
|
|
@@ -322,11 +785,11 @@
|
|
| 322 |
" db = client[db_name]\n",
|
| 323 |
" collection = db[collection_name]\n",
|
| 324 |
" cursor = collection.find().limit(limit)\n",
|
| 325 |
-
"
|
| 326 |
-
" document_count = len(
|
| 327 |
" \n",
|
| 328 |
" # Dökümanları isimlendir\n",
|
| 329 |
-
" named_documents = {f'döküman {i+1}': doc for i, doc in enumerate(
|
| 330 |
" \n",
|
| 331 |
" return named_documents, document_count\n",
|
| 332 |
"\n",
|
|
@@ -353,7 +816,7 @@
|
|
| 353 |
" return Database.get_input_documents(limit)\n",
|
| 354 |
"\n",
|
| 355 |
"# Kullanım örneği\n",
|
| 356 |
-
"named_documents, document_count = Tf.get_input_documents(limit=
|
| 357 |
"\n",
|
| 358 |
"#tf-ıdf ile döküman içerisinden kelime seçme \n",
|
| 359 |
"\n",
|
|
@@ -387,23 +850,30 @@
|
|
| 387 |
" for word, score in sorted_words[:3]:\n",
|
| 388 |
" print(\"\\tWord: {}, TF-IDF: {}\".format(word, round(score, 5)))\n",
|
| 389 |
"\n",
|
| 390 |
"turkish_stop_words = [\n",
|
| 391 |
" 'ah', 'ama', 'an', 'ancak', 'araba', 'aralar', 'aslında', \n",
|
| 392 |
-
"
|
| 393 |
-
" '
|
| 394 |
-
" '
|
| 395 |
-
" '
|
| 396 |
-
" '
|
| 397 |
"]\n",
|
| 398 |
"\n",
|
| 399 |
-
"#
|
| 400 |
-
"def calculate_tfidf(
|
| 401 |
" vectorizer = TfidfVectorizer(stop_words=stop_words, max_features=10000)\n",
|
| 402 |
-
" tfidf_matrix = vectorizer.fit_transform(
|
| 403 |
" feature_names = vectorizer.get_feature_names_out()\n",
|
| 404 |
" return tfidf_matrix, feature_names\n",
|
| 405 |
"\n",
|
| 406 |
-
"
|
| 407 |
"#kelimelerin ortalama skorlarını hesaplama \n",
|
| 408 |
"def identify_low_tfidf_words(tfidf_matrix, feature_names, threshold=0.001):\n",
|
| 409 |
" # TF-IDF skorlarını toplayarak her kelimenin ortalama skorunu hesaplayın\n",
|
|
@@ -411,48 +881,54 @@
|
|
| 411 |
" low_tfidf_words = [feature_names[i] for i, score in enumerate(avg_scores) if score < threshold]\n",
|
| 412 |
" return low_tfidf_words\n",
|
| 413 |
"\n",
|
| 414 |
-
"#kelimelerin güncellenmesi \n",
|
| 415 |
"def update_stop_words(existing_stop_words, low_tfidf_words):\n",
|
| 416 |
" updated_stop_words = set(existing_stop_words) | set(low_tfidf_words)\n",
|
| 417 |
" return list(updated_stop_words)\n",
|
| 418 |
"\n",
|
| 419 |
"\n",
|
| 420 |
-
"
|
|
|
|
| 421 |
" stop_words = set(initial_stop_words)\n",
|
| 422 |
" for _ in range(iterations):\n",
|
| 423 |
-
" tfidf_matrix, feature_names = calculate_tfidf(
|
| 424 |
" low_tfidf_words = identify_low_tfidf_words(tfidf_matrix, feature_names)\n",
|
| 425 |
" stop_words = update_stop_words(stop_words, low_tfidf_words)\n",
|
| 426 |
" return list(stop_words)\n",
|
| 427 |
-
"
|
| 428 |
"\n",
|
| 429 |
"\n",
|
| 430 |
"def main ():\n",
|
| 431 |
"\n",
|
|
|
|
| 432 |
"#anlam ilişkisini de kontrol edecek bir yapı oluşpturulacak title ile benzerlik kontrol ederek yüksek benzerlik içeren kelimler sıralnacak .\n",
|
| 433 |
"\n",
|
| 434 |
"# Dökümanları liste olarak al\n",
|
| 435 |
" documents_list = [doc.get('text', '') if isinstance(doc, dict) else doc for doc in list(named_documents.values())]\n",
|
| 436 |
"\n",
|
| 437 |
" #tf-ıdf hesaplama\n",
|
| 438 |
-
" tfidf_matrix, feature_names=calculate_tfidf(documents_list,
|
| 439 |
"\n",
|
| 440 |
-
"
|
| 441 |
-
" named_documents, document_count = Database.get_input_documents(limit=3)\n",
|
| 442 |
"\n",
|
| 443 |
-
"
|
| 444 |
-
"
|
|
|
|
| 445 |
"\n",
|
| 446 |
-
"
|
| 447 |
-
" final_stop_words = iterative_update(documents_list, initial_stop_words)\n",
|
| 448 |
"\n",
|
| 449 |
-
"
|
|
|
|
| 450 |
"\n",
|
| 451 |
"\n",
|
| 452 |
"# Sonuçları yazdır\n",
|
| 453 |
-
"
|
| 454 |
-
"
|
| 455 |
-
"
|
| 456 |
"\n",
|
| 457 |
" print(\"\\nDökümanlar Listesi:\")\n",
|
| 458 |
" print(documents_list)\n",
|
|
@@ -534,9 +1010,21 @@
|
|
| 534 |
},
|
| 535 |
{
|
| 536 |
"cell_type": "code",
|
| 537 |
-
"execution_count":
|
| 538 |
"metadata": {},
|
| 539 |
-
"outputs": [
|
| 540 |
"source": [
|
| 541 |
"\n",
|
| 542 |
"#---------------------------------------------------------------------------------------------------------------------------------\n",
|
|
|
|
| 9 |
},
|
| 10 |
{
|
| 11 |
"cell_type": "code",
|
| 12 |
+
"execution_count": 6,
|
| 13 |
"metadata": {},
|
| 14 |
+
"outputs": [],
|
| 15 |
"source": [
|
| 16 |
"from datasets import load_dataset\n",
|
| 17 |
"import pandas as pd \n",
|
|
|
|
| 30 |
},
|
| 31 |
{
|
| 32 |
"cell_type": "code",
|
| 33 |
+
"execution_count": 8,
|
| 34 |
"metadata": {},
|
| 35 |
"outputs": [],
|
| 36 |
"source": [
|
|
|
|
| 41 |
},
|
| 42 |
{
|
| 43 |
"cell_type": "code",
|
| 44 |
+
"execution_count": 9,
|
| 45 |
"metadata": {},
|
| 46 |
"outputs": [],
|
| 47 |
"source": [
|
|
|
|
| 51 |
},
|
| 52 |
{
|
| 53 |
"cell_type": "code",
|
| 54 |
+
"execution_count": 10,
|
| 55 |
"metadata": {},
|
| 56 |
"outputs": [],
|
| 57 |
"source": [
|
|
|
|
| 62 |
},
|
| 63 |
{
|
| 64 |
"cell_type": "code",
|
| 65 |
+
"execution_count": 11,
|
| 66 |
"metadata": {},
|
| 67 |
"outputs": [],
|
| 68 |
"source": [
|
|
|
|
| 92 |
},
|
| 93 |
{
|
| 94 |
"cell_type": "code",
|
| 95 |
+
"execution_count": 12,
|
| 96 |
"metadata": {},
|
| 97 |
"outputs": [
|
| 98 |
{
|
|
|
|
| 142 |
},
|
| 143 |
{
|
| 144 |
"cell_type": "code",
|
| 145 |
+
"execution_count": 13,
|
| 146 |
"metadata": {},
|
| 147 |
"outputs": [
|
| 148 |
{
|
|
|
|
| 197 |
" return train_collection,test_collection\n",
|
| 198 |
"\n",
|
| 199 |
"# Train ve test datasetlerini MongoDB'ye yüklemek için fonksiyonu çağır\n",
|
| 200 |
+
"train_file_path = 'C:\\\\gitProjects\\\\yeni\\\\datasets\\\\train_Egitim\\\\merged_train.parquet'\n",
|
| 201 |
+
"test_file_path = 'C:\\\\gitProjects\\\\yeni\\\\datasets\\\\test_Egitim\\\\merged_test.parquet'\n",
|
| 202 |
"\n",
|
| 203 |
"train_collection, test_collection = dataset_read(train_file_path, test_file_path)"
|
| 204 |
]
|
| 205 |
},
|
| 206 |
+
{
|
| 207 |
+
"cell_type": "code",
|
| 208 |
+
"execution_count": null,
|
| 209 |
+
"metadata": {},
|
| 210 |
+
"outputs": [],
|
| 211 |
+
"source": [
|
| 212 |
+
"import pandas as pd\n",
|
| 213 |
+
"from pymongo import MongoClient,errors\n",
|
| 214 |
+
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
|
| 215 |
+
"from sentence_transformers import SentenceTransformer\n",
|
| 216 |
+
"\n",
|
| 217 |
+
"# MongoDB bağlantı ve koleksiyon seçimi için fonksiyon\n",
|
| 218 |
+
"def get_mongodb(database_name='EgitimDatabase', train_collection_name='train', test_collection_name='test', host='localhost', port=27017):\n",
|
| 219 |
+
" client = MongoClient(f'mongodb://{host}:{port}/')\n",
|
| 220 |
+
" db = client[database_name]\n",
|
| 221 |
+
" train_collection = db[train_collection_name]\n",
|
| 222 |
+
" test_collection = db[test_collection_name]\n",
|
| 223 |
+
" return train_collection, test_collection\n",
|
| 224 |
+
"\n",
|
| 225 |
+
"# Dataset'i MongoDB'ye yükleme fonksiyonu\n",
|
| 226 |
+
"def dataset_read(train_file_path, test_file_path):\n",
|
| 227 |
+
" try:\n",
|
| 228 |
+
" # MongoDB koleksiyonlarını al\n",
|
| 229 |
+
" train_collection, test_collection = get_mongodb()\n",
|
| 230 |
+
"\n",
|
| 231 |
+
" # Eğer koleksiyonlar zaten doluysa, veri yüklemesi yapma\n",
|
| 232 |
+
" if train_collection.estimated_document_count() > 0 or test_collection.estimated_document_count() > 0:\n",
|
| 233 |
+
" print(\"Veriler zaten yüklendi, işlem yapılmadı.\")\n",
|
| 234 |
+
" return train_collection, test_collection\n",
|
| 235 |
+
"\n",
|
| 236 |
+
" # Datasetleri oku\n",
|
| 237 |
+
" data_train = pd.read_parquet(train_file_path, columns=['id', 'url', 'title', 'text'])\n",
|
| 238 |
+
" data_test = pd.read_parquet(test_file_path, columns=['id', 'url', 'title', 'text'])\n",
|
| 239 |
+
"\n",
|
| 240 |
+
" # Verileri MongoDB'ye yükle\n",
|
| 241 |
+
" train_collection.insert_many(data_train.to_dict(\"records\"))\n",
|
| 242 |
+
" test_collection.insert_many(data_test.to_dict(\"records\"))\n",
|
| 243 |
+
"\n",
|
| 244 |
+
" print(f\"Veriler başarıyla {train_collection.name} koleksiyonuna yüklendi.\")\n",
|
| 245 |
+
" print(f\"Veriler başarıyla {test_collection.name} koleksiyonuna yüklendi.\")\n",
|
| 246 |
+
" \n",
|
| 247 |
+
" except errors.PyMongoError as e:\n",
|
| 248 |
+
" print(f\"Veri yükleme sırasında hata oluştu: {e}\")\n",
|
| 249 |
+
"\n",
|
| 250 |
+
" return train_collection, test_collection\n",
|
| 251 |
+
"\n",
|
| 252 |
+
"\n",
|
| 253 |
+
"\n",
|
| 254 |
+
"# Database sınıfı: Veritabanı bağlantıları ve verileri çekme işlevleri\n",
|
| 255 |
+
"class Database:\n",
|
| 256 |
+
" @staticmethod\n",
|
| 257 |
+
" def get_mongodb():\n",
|
| 258 |
+
" return get_mongodb()\n",
|
| 259 |
+
"\n",
|
| 260 |
+
" @staticmethod\n",
|
| 261 |
+
" def get_titles_and_texts():\n",
|
| 262 |
+
" # MongoDB bağlantısı ve koleksiyonları al\n",
|
| 263 |
+
" train_collection, _ = Database.get_mongodb()\n",
|
| 264 |
+
"\n",
|
| 265 |
+
" # Sorgu: Hem \"title\" hem de \"text\" alanı mevcut olan belgeler\n",
|
| 266 |
+
" query = {\"title\": {\"$exists\": True}, \"text\": {\"$exists\": True}}\n",
|
| 267 |
+
"\n",
|
| 268 |
+
" # Belirtilen alanları seçiyoruz: \"title\", \"text\"\n",
|
| 269 |
+
" cursor = train_collection.find(query, {\"title\": 1, \"text\": 1, \"_id\": 0})\n",
|
| 270 |
+
"\n",
|
| 271 |
+
" # Başlık ve metinleri doğru bir şekilde birleştiriyoruz\n",
|
| 272 |
+
" documents = [{\"title\": doc['title'], \"text\": doc['text']} for doc in cursor]\n",
|
| 273 |
+
" document_count = len(documents)\n",
|
| 274 |
+
" return documents, document_count\n",
|
| 275 |
+
"\n",
|
| 276 |
+
"# Train ve test datasetlerini MongoDB'ye yüklemek için fonksiyonu çağır\n",
|
| 277 |
+
"train_file_path = 'C:\\\\gitProjects\\\\yeni\\\\datasets\\\\train_Egitim\\\\merged_train.parquet'\n",
|
| 278 |
+
"test_file_path = 'C:\\\\gitProjects\\\\yeni\\\\datasets\\\\test_Egitim\\\\merged_test.parquet'\n",
|
| 279 |
+
"\n",
|
| 280 |
+
"train_collection, test_collection = dataset_read(train_file_path, test_file_path)\n",
|
| 281 |
+
"\n",
|
| 282 |
+
"# Veritabanından başlıklar ve metinler alınır\n",
|
| 283 |
+
"documents, document_count = Database.get_titles_and_texts()\n",
|
| 284 |
+
"\n",
|
| 285 |
+
"# Sonuçların belirlenmesi\n",
|
| 286 |
+
"print(f\"Başlık ve metin çiftleri: {documents}\")\n",
|
| 287 |
+
"print(f\"Toplam çift sayısı: {document_count}\")\n"
|
| 288 |
+
]
|
| 289 |
+
},
|
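A quick way to confirm this load step is to compare MongoDB's document counts against the parquet row counts. A minimal sketch, assuming the same get_mongodb(), file paths, and collection names as the cell above (illustrative only, not part of the committed notebook):

    # Sanity check: MongoDB counts vs. parquet rows
    import pandas as pd
    train_collection, test_collection = get_mongodb()
    expected_train = len(pd.read_parquet(train_file_path, columns=['id']))
    print("train docs in MongoDB:", train_collection.estimated_document_count(), "- parquet rows:", expected_train)
    print("test docs in MongoDB:", test_collection.estimated_document_count())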
| 290 |
{
|
| 291 |
"cell_type": "markdown",
|
| 292 |
"metadata": {},
|
|
|
|
| 300 |
"metadata": {},
|
| 301 |
"outputs": [],
|
| 302 |
"source": [
|
| 303 |
+
"\"\"\"@staticmethod\n",
|
| 304 |
+
" def get_input_titles():\n",
|
| 305 |
+
" collection = Database.get_mongodb(collection_name='train')\n",
|
| 306 |
+
" query = {\"title\": {\"$exists\": True}}\n",
|
| 307 |
+
" cursor = collection.find(query, {\"title\": 1, \"_id\": 0})\n",
|
| 308 |
+
" title_from_db = [doc['title'] for doc in cursor]\n",
|
| 309 |
+
"\n",
|
| 310 |
+
" return title_from_db\"\"\"\n",
|
| 311 |
+
"\n",
|
| 312 |
+
"\"\"\"@staticmethod\n",
|
| 313 |
+
" def get_input_texts():\n",
|
| 314 |
+
" collection = Database.get_mongodb(collection_name='train')\n",
|
| 315 |
+
" query = {\"texts\": {\"$exists\": True}}\n",
|
| 316 |
+
" cursor = collection.find(query, {\"texts\": 1, \"_id\": 0})\n",
|
| 317 |
+
" texts_from_db = [doc['texts'] for doc in cursor]\n",
|
| 318 |
+
" return texts_from_db\"\"\"\n",
|
| 319 |
+
" \n",
|
| 320 |
+
" #bin tane veri çekerek csv dosyası olarak kaydetme \n",
|
| 321 |
+
" \n",
|
| 322 |
+
" \n",
|
| 323 |
+
"\"\"\"@staticmethod\n",
|
| 324 |
+
" def get_titles_and_texts(batch_size=1000):\n",
|
| 325 |
+
"\n",
|
| 326 |
+
" \n",
|
| 327 |
+
" titles = Database.get_input_titles(batch_size=batch_size)\n",
|
| 328 |
+
" texts = Database.get_input_texts(batch_size=batch_size )\n",
|
| 329 |
+
" \n",
|
| 330 |
+
"\n",
|
| 331 |
+
"\n",
|
| 332 |
+
" def test_queries():\n",
|
| 333 |
+
"\n",
|
| 334 |
+
" collection = Database.get_mongodb(collection_name='train')\n",
|
| 335 |
+
" # Başlık sorgusu\n",
|
| 336 |
+
" titles_cursor = collection.find({\"title\": {\"$exists\": True}}, {\"title\": 1, \"_id\": 0})\n",
|
| 337 |
+
" titles = [doc['title'] for doc in titles_cursor]\n",
|
| 338 |
+
" \n",
|
| 339 |
+
"\n",
|
| 340 |
+
" # Metin sorgusu\n",
|
| 341 |
+
" texts_cursor = collection.find({\"text\": {\"$exists\": True}}, {\"text\": 1, \"_id\": 0})\n",
|
| 342 |
+
" texts = [doc['text'] for doc in texts_cursor]\n",
|
| 343 |
+
" \n",
|
| 344 |
+
" # Başlık ve metinlerin eşleşmesini sağlamak için zip kullanarak birleştiriyoruz\n",
|
| 345 |
+
" documents = [{\"title\": title, \"text\": text} for title, text in zip(titles, texts)]\n",
|
| 346 |
+
" document_count = len(documents)\n",
|
| 347 |
+
" return documents, document_count\n",
|
| 348 |
+
"\n",
|
| 349 |
+
"Database.test_queries()\n",
|
| 350 |
+
"\n",
|
| 351 |
+
"# Veritabanından başlıklar ve metinler alınır\n",
|
| 352 |
+
"documents, document_count = Database.get_titles_and_texts(batch_size=1000)\n",
|
| 353 |
+
"\n",
|
| 354 |
+
"# Sonuçların belirlenmesi\n",
|
| 355 |
+
"print(f\"Başlık ve metin çiftleri: {documents}\")\n",
|
| 356 |
+
"print(f\"Toplam çift sayısı: {document_count}\")\"\"\""
|
| 357 |
+
]
|
| 358 |
+
},
|
| 359 |
+
{
|
| 360 |
+
"cell_type": "markdown",
|
| 361 |
+
"metadata": {},
|
| 362 |
+
"source": [
|
| 363 |
+
"Output'u vereceğimiz title ve textin kodu"
|
| 364 |
+
]
|
| 365 |
+
},
|
| 366 |
+
{
|
| 367 |
+
"cell_type": "code",
|
| 368 |
+
"execution_count": 8,
|
| 369 |
+
"metadata": {},
|
| 370 |
+
"outputs": [
|
| 371 |
+
{
|
| 372 |
+
"name": "stdout",
|
| 373 |
+
"output_type": "stream",
|
| 374 |
+
"text": [
|
| 375 |
+
"0 **Pşıqo Ahecaqo** Pşıqo Ahecaqo (), Çerkes siy...\n",
|
| 376 |
+
"1 **Craterolophinae** Craterolophinae, Depastrid...\n",
|
| 377 |
+
"2 **Notocrabro** Notocrabro Crabronina oymağına ...\n",
|
| 378 |
+
"3 **Ibrahim Sissoko** İbrahim Sissoko (d. 30 Kas...\n",
|
| 379 |
+
"4 **Salah Cedid** Salah Cedid (1926-1993) (Arapç...\n",
|
| 380 |
+
"Name: combined, dtype: object\n",
|
| 381 |
+
"Veriler combined_output.csv dosyasına başarıyla kaydedildi.\n"
|
| 382 |
+
]
|
| 383 |
+
}
|
| 384 |
+
],
|
| 385 |
+
"source": [
|
| 386 |
+
"from pymongo import MongoClient\n",
|
| 387 |
+
"import pandas as pd\n",
|
| 388 |
+
"from tqdm.auto import tqdm, trange\n",
|
| 389 |
+
"\n",
|
| 390 |
+
"# Database bağlantıları ve verileri çekme işlevleri\n",
|
| 391 |
+
"class Database:\n",
|
| 392 |
+
" @staticmethod\n",
|
| 393 |
+
" def get_mongodb(database_name='EgitimDatabase', train_collection_name='train', test_collection_name='test', host='localhost', port=27017):\n",
|
| 394 |
+
" client = MongoClient(f'mongodb://{host}:{port}/')\n",
|
| 395 |
+
" db = client[database_name]\n",
|
| 396 |
+
" train_collection = db[train_collection_name]\n",
|
| 397 |
+
" test_collection = db[test_collection_name]\n",
|
| 398 |
+
" return train_collection, test_collection\n",
|
| 399 |
+
"\n",
|
| 400 |
+
" def export_to_csv(batch_size=1000, output_file='combined_output.csv'):\n",
|
| 401 |
+
" train_collection, _ = Database.get_mongodb()\n",
|
| 402 |
+
" cursor = train_collection.find({}, {\"title\": 1, \"text\": 1, \"_id\": 0})\n",
|
| 403 |
+
" cursor = cursor.batch_size(batch_size) # Fix: Call batch_size on the cursor object\n",
|
| 404 |
+
"\n",
|
| 405 |
+
" # Verileri DataFrame'e dönüştürme\n",
|
| 406 |
+
" df= pd.DataFrame(list(cursor))\n",
|
| 407 |
+
" \n",
|
| 408 |
+
" # title ve text sütunlarını birleştirme\n",
|
| 409 |
+
" df['combined'] = df.apply(lambda row: f'**{row[\"title\"]}** {row[\"text\"]}', axis=1)\n",
|
| 410 |
+
" \n",
|
| 411 |
+
" #title,text and combined sütunlarını ayrı ayrı tutma\n",
|
| 412 |
+
" #df2['title_only'] = df2['title']\n",
|
| 413 |
+
" #df2['text_only'] = df2['text']\n",
|
| 414 |
+
" #df['combined']= output_file\n",
|
| 415 |
+
"\n",
|
| 416 |
+
" # Sonuçları kontrol etme\n",
|
| 417 |
+
" combined_text= df['combined'] \n",
|
| 418 |
+
" # Print the combined column directly\n",
|
| 419 |
+
" \n",
|
| 420 |
+
" print(combined_text.head())\n",
|
| 421 |
+
"\n",
|
| 422 |
+
" # Birleşmiş verileri CSV'ye kaydetme\n",
|
| 423 |
+
" \n",
|
| 424 |
+
" df.to_csv(output_file, index=False)\n",
|
| 425 |
+
" \n",
|
| 426 |
+
" print(f\"Veriler combined_output.csv dosyasına başarıyla kaydedildi.\")\n",
|
| 427 |
+
" \n",
|
| 428 |
+
"\n",
|
| 429 |
+
"# CSV dosyasını okuma ve birleştirme işlemi\n",
|
| 430 |
+
"Database.export_to_csv()"
|
| 431 |
+
]
|
| 432 |
+
},
|
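The export above writes a 'combined' column in the form "**title** text". Reading the CSV back and splitting that column again is straightforward; a hedged sketch that assumes this convention and the default output file name:

    # Illustrative read-back of combined_output.csv; the regex assumes the "**title** text" format used above
    import re
    import pandas as pd
    df = pd.read_csv('combined_output.csv')
    parts = df['combined'].str.extract(r'^\*\*(?P<title>.*?)\*\*\s*(?P<text>.*)$', flags=re.DOTALL)
    print(parts.head())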
| 433 |
+
{
|
| 434 |
+
"cell_type": "markdown",
|
| 435 |
+
"metadata": {},
|
| 436 |
+
"source": [
|
| 437 |
+
"TF-IDF HESAPLAMA"
|
| 438 |
+
]
|
| 439 |
+
},
|
| 440 |
+
{
|
| 441 |
+
"cell_type": "code",
|
| 442 |
+
"execution_count": 20,
|
| 443 |
+
"metadata": {},
|
| 444 |
+
"outputs": [
|
| 445 |
+
{
|
| 446 |
+
"name": "stderr",
|
| 447 |
+
"output_type": "stream",
|
| 448 |
+
"text": [
|
| 449 |
+
"[nltk_data] Downloading package wordnet to\n",
|
| 450 |
+
"[nltk_data] C:\\Users\\info\\AppData\\Roaming\\nltk_data...\n",
|
| 451 |
+
"[nltk_data] Package wordnet is already up-to-date!\n",
|
| 452 |
+
"[nltk_data] Downloading package omw-1.4 to\n",
|
| 453 |
+
"[nltk_data] C:\\Users\\info\\AppData\\Roaming\\nltk_data...\n",
|
| 454 |
+
"[nltk_data] Package omw-1.4 is already up-to-date!\n",
|
| 455 |
+
"[nltk_data] Downloading package stopwords to\n",
|
| 456 |
+
"[nltk_data] C:\\Users\\info\\AppData\\Roaming\\nltk_data...\n",
|
| 457 |
+
"[nltk_data] Package stopwords is already up-to-date!\n"
|
| 458 |
+
]
|
| 459 |
+
},
|
| 460 |
+
{
|
| 461 |
+
"ename": "ValueError",
|
| 462 |
+
"evalue": "empty vocabulary; perhaps the documents only contain stop words",
|
| 463 |
+
"output_type": "error",
|
| 464 |
+
"traceback": [
|
| 465 |
+
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
|
| 466 |
+
"\u001b[1;31mValueError\u001b[0m Traceback (most recent call last)",
|
| 467 |
+
"Cell \u001b[1;32mIn[20], line 100\u001b[0m\n\u001b[0;32m 97\u001b[0m documents, document_count \u001b[38;5;241m=\u001b[39m Database\u001b[38;5;241m.\u001b[39mget_input_documents()\n\u001b[0;32m 99\u001b[0m \u001b[38;5;66;03m# Calculate TF-IDF and get feature names\u001b[39;00m\n\u001b[1;32m--> 100\u001b[0m tfidf_matrix, feature_names \u001b[38;5;241m=\u001b[39m \u001b[43mDatabase\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcalculate_tfidf\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdocuments\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mturkish_stop_words\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 102\u001b[0m \u001b[38;5;66;03m# Extract keywords\u001b[39;00m\n\u001b[0;32m 103\u001b[0m keywords \u001b[38;5;241m=\u001b[39m Database\u001b[38;5;241m.\u001b[39mextract_keywords(tfidf_matrix, feature_names, stop_words\u001b[38;5;241m=\u001b[39mturkish_stop_words)\n",
|
| 468 |
+
"Cell \u001b[1;32mIn[20], line 43\u001b[0m, in \u001b[0;36mDatabase.calculate_tfidf\u001b[1;34m(documents, stop_words)\u001b[0m\n\u001b[0;32m 40\u001b[0m \u001b[38;5;129m@staticmethod\u001b[39m\n\u001b[0;32m 41\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mcalculate_tfidf\u001b[39m(documents, stop_words):\n\u001b[0;32m 42\u001b[0m vectorizer \u001b[38;5;241m=\u001b[39m TfidfVectorizer(stop_words\u001b[38;5;241m=\u001b[39mstop_words, max_features\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m10000\u001b[39m,min_df\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m2\u001b[39m)\n\u001b[1;32m---> 43\u001b[0m tfidf_matrix \u001b[38;5;241m=\u001b[39m \u001b[43mvectorizer\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfit_transform\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdocuments\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 44\u001b[0m feature_names \u001b[38;5;241m=\u001b[39m vectorizer\u001b[38;5;241m.\u001b[39mget_feature_names_out()\n\u001b[0;32m 45\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m tfidf_matrix, feature_names\n",
|
| 469 |
+
"File \u001b[1;32mc:\\gitProjects\\yeni\\.venv\\lib\\site-packages\\sklearn\\feature_extraction\\text.py:2091\u001b[0m, in \u001b[0;36mTfidfVectorizer.fit_transform\u001b[1;34m(self, raw_documents, y)\u001b[0m\n\u001b[0;32m 2084\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_check_params()\n\u001b[0;32m 2085\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_tfidf \u001b[38;5;241m=\u001b[39m TfidfTransformer(\n\u001b[0;32m 2086\u001b[0m norm\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mnorm,\n\u001b[0;32m 2087\u001b[0m use_idf\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39muse_idf,\n\u001b[0;32m 2088\u001b[0m smooth_idf\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39msmooth_idf,\n\u001b[0;32m 2089\u001b[0m sublinear_tf\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39msublinear_tf,\n\u001b[0;32m 2090\u001b[0m )\n\u001b[1;32m-> 2091\u001b[0m X \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43msuper\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfit_transform\u001b[49m\u001b[43m(\u001b[49m\u001b[43mraw_documents\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 2092\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_tfidf\u001b[38;5;241m.\u001b[39mfit(X)\n\u001b[0;32m 2093\u001b[0m \u001b[38;5;66;03m# X is already a transformed view of raw_documents so\u001b[39;00m\n\u001b[0;32m 2094\u001b[0m \u001b[38;5;66;03m# we set copy to False\u001b[39;00m\n",
|
| 470 |
+
"File \u001b[1;32mc:\\gitProjects\\yeni\\.venv\\lib\\site-packages\\sklearn\\base.py:1473\u001b[0m, in \u001b[0;36m_fit_context.<locals>.decorator.<locals>.wrapper\u001b[1;34m(estimator, *args, **kwargs)\u001b[0m\n\u001b[0;32m 1466\u001b[0m estimator\u001b[38;5;241m.\u001b[39m_validate_params()\n\u001b[0;32m 1468\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m config_context(\n\u001b[0;32m 1469\u001b[0m skip_parameter_validation\u001b[38;5;241m=\u001b[39m(\n\u001b[0;32m 1470\u001b[0m prefer_skip_nested_validation \u001b[38;5;129;01mor\u001b[39;00m global_skip_validation\n\u001b[0;32m 1471\u001b[0m )\n\u001b[0;32m 1472\u001b[0m ):\n\u001b[1;32m-> 1473\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m fit_method(estimator, \u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n",
|
| 471 |
+
"File \u001b[1;32mc:\\gitProjects\\yeni\\.venv\\lib\\site-packages\\sklearn\\feature_extraction\\text.py:1372\u001b[0m, in \u001b[0;36mCountVectorizer.fit_transform\u001b[1;34m(self, raw_documents, y)\u001b[0m\n\u001b[0;32m 1364\u001b[0m warnings\u001b[38;5;241m.\u001b[39mwarn(\n\u001b[0;32m 1365\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mUpper case characters found in\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m 1366\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m vocabulary while \u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mlowercase\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m 1367\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m is True. These entries will not\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m 1368\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m be matched with any documents\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m 1369\u001b[0m )\n\u001b[0;32m 1370\u001b[0m \u001b[38;5;28;01mbreak\u001b[39;00m\n\u001b[1;32m-> 1372\u001b[0m vocabulary, X \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_count_vocab\u001b[49m\u001b[43m(\u001b[49m\u001b[43mraw_documents\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfixed_vocabulary_\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 1374\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mbinary:\n\u001b[0;32m 1375\u001b[0m X\u001b[38;5;241m.\u001b[39mdata\u001b[38;5;241m.\u001b[39mfill(\u001b[38;5;241m1\u001b[39m)\n",
|
| 472 |
+
"File \u001b[1;32mc:\\gitProjects\\yeni\\.venv\\lib\\site-packages\\sklearn\\feature_extraction\\text.py:1278\u001b[0m, in \u001b[0;36mCountVectorizer._count_vocab\u001b[1;34m(self, raw_documents, fixed_vocab)\u001b[0m\n\u001b[0;32m 1276\u001b[0m vocabulary \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mdict\u001b[39m(vocabulary)\n\u001b[0;32m 1277\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m vocabulary:\n\u001b[1;32m-> 1278\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\n\u001b[0;32m 1279\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mempty vocabulary; perhaps the documents only contain stop words\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m 1280\u001b[0m )\n\u001b[0;32m 1282\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m indptr[\u001b[38;5;241m-\u001b[39m\u001b[38;5;241m1\u001b[39m] \u001b[38;5;241m>\u001b[39m np\u001b[38;5;241m.\u001b[39miinfo(np\u001b[38;5;241m.\u001b[39mint32)\u001b[38;5;241m.\u001b[39mmax: \u001b[38;5;66;03m# = 2**31 - 1\u001b[39;00m\n\u001b[0;32m 1283\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m _IS_32BIT:\n",
|
| 473 |
+
"\u001b[1;31mValueError\u001b[0m: empty vocabulary; perhaps the documents only contain stop words"
|
| 474 |
+
]
|
| 475 |
+
}
|
| 476 |
+
],
|
| 477 |
+
"source": [
|
| 478 |
+
"#---------------------------güncel en yeni \n",
|
| 479 |
+
"from pymongo import MongoClient\n",
|
| 480 |
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
|
| 481 |
+
"from textblob import TextBlob as tb\n",
|
| 482 |
+
"import numpy as np\n",
|
| 483 |
+
"import math\n",
|
| 484 |
+
"from tqdm.auto import tqdm, trange\n",
|
| 485 |
+
"import tensorflow as tf\n",
|
| 486 |
+
"import nltk\n",
|
| 487 |
+
"from nltk.stem import WordNetLemmatizer\n",
|
| 488 |
+
"from nltk.corpus import stopwords\n",
|
| 489 |
"\n",
|
| 490 |
+
"turkish_stop_words = stopwords.words('turkish')\n",
|
| 491 |
+
"\n",
|
| 492 |
+
"nltk.download('wordnet')\n",
|
| 493 |
+
"nltk.download('omw-1.4')\n",
|
| 494 |
+
"nltk.download('stopwords')\n",
|
| 495 |
+
"\n",
|
| 496 |
+
"\n",
|
| 497 |
+
"import matplotlib.pyplot as plt \n",
|
| 498 |
"\n",
|
| 499 |
"class Database:\n",
|
| 500 |
" @staticmethod\n",
|
| 501 |
" def get_mongodb():\n",
|
| 502 |
+
" return 'mongodb://localhost:27017/', 'combined', 'combined_output'\n",
|
|
|
|
| 503 |
"\n",
|
| 504 |
+
" # Get input documents from MongoDB\n",
|
| 505 |
" @staticmethod\n",
|
| 506 |
+
" def get_input_documents(limit=1000):\n",
|
| 507 |
" mongo_url, db_name, collection_name = Database.get_mongodb()\n",
|
| 508 |
" client = MongoClient(mongo_url)\n",
|
| 509 |
" db = client[db_name]\n",
|
| 510 |
" collection = db[collection_name]\n",
|
| 511 |
+
" cursor = collection.find().limit(limit)\n",
|
| 512 |
+
" combined_text = [doc['text'] for doc in cursor]\n",
|
| 513 |
+
" document_count = len(combined_text)\n",
|
| 514 |
+
" return combined_text, document_count\n",
|
| 515 |
" \n",
|
| 516 |
+
" # Calculate TF-IDF and get feature names\n",
|
| 517 |
" @staticmethod\n",
|
| 518 |
+
" def calculate_tfidf(documents, stop_words):\n",
|
| 519 |
+
" vectorizer = TfidfVectorizer(stop_words=stop_words, max_features=10000,min_df=2)\n",
|
| 520 |
+
" tfidf_matrix = vectorizer.fit_transform(documents)\n",
|
| 521 |
+
" feature_names = vectorizer.get_feature_names_out()\n",
|
| 522 |
+
" return tfidf_matrix, feature_names\n",
|
| 523 |
+
"\n",
|
| 524 |
+
" # Extract keywords using TF-IDF\n",
|
| 525 |
+
" def extract_keywords(tfidf_matrix, feature_names, top_n=10, stop_words=[]):\n",
|
| 526 |
+
" keywords = {}\n",
|
| 527 |
+
" for doc_idx, row in enumerate(tfidf_matrix):\n",
|
| 528 |
+
" filtered_feature_names = [name for name in feature_names if name.lower() not in stop_words]\n",
|
| 529 |
+
" scores = np.asarray(row.T.todense()).flatten()\n",
|
| 530 |
+
" sorted_indices = np.argsort(scores)[::-1]\n",
|
| 531 |
+
" top_features = sorted_indices[:top_n]\n",
|
| 532 |
+
" doc_keywords = [(filtered_feature_names[idx], scores[idx]) for idx in top_features]\n",
|
| 533 |
+
" keywords[f'document_{doc_idx+1}'] = doc_keywords\n",
|
| 534 |
+
" return keywords\n",
|
| 535 |
+
" \n",
|
| 536 |
+
" #zip keywords and combined text \n",
|
| 537 |
+
" \n",
|
| 538 |
+
" # Identify low TF-IDF words\n",
|
| 539 |
+
" @staticmethod\n",
|
| 540 |
+
" def identify_low_tfidf_words(tfidf_matrix, feature_names, threshold=0.001):\n",
|
| 541 |
+
" avg_scores = np.mean(tfidf_matrix, axis=0).A1\n",
|
| 542 |
+
" low_tfidf_words = [feature_names[i] for i, score in enumerate(avg_scores) if score < threshold]\n",
|
| 543 |
+
" return low_tfidf_words\n",
|
| 544 |
+
" \n",
|
| 545 |
+
" # Update stop words with low TF-IDF words\n",
|
| 546 |
+
" @staticmethod\n",
|
| 547 |
+
" def update_stop_words(existing_stop_words, low_tfidf_words):\n",
|
| 548 |
+
" updated_stop_words = set(existing_stop_words) | set(low_tfidf_words)\n",
|
| 549 |
+
" return list(updated_stop_words)\n",
|
| 550 |
"\n",
|
| 551 |
"\n",
|
| 552 |
+
"#tf-ıdf ile döküman içerisinden kelime seçme \n",
|
| 553 |
+
"#Term Frequency (TF): Bir kelimenin belli bir dökümanda tekrar etme değeri\n",
|
| 554 |
+
"#Inverse Document Frequency (IDF):bir kelimenin tüm dökümanlar arasındaki yaygınlığı Nadir bulunan kelimeler, daha yüksek IDF değerine sahip olur.\n",
|
| 555 |
+
"#tf-ıdf skoru ise bu ikisinin çarpımıdır.\n",
|
| 556 |
"\n",
|
| 557 |
+
" #buraya eşik değer belirlenmeli\n",
|
| 558 |
+
"\n",
|
| 559 |
+
"\n",
|
| 560 |
+
"turkish_stop_words = [\n",
|
| 561 |
+
" 'ah', 'ama', 'an', 'ancak', 'araba', 'aralar', 'aslında', \n",
|
| 562 |
+
" 'b', 'başlayan', 'bağlı', 'bazı', 'belirli', 'ben', 'bence', \n",
|
| 563 |
+
" 'birkaç', 'birlikte', 'bunu', 'burada', 'biten', 'biz', \n",
|
| 564 |
+
" 'bu', 'buna', 'çünkü', 'da', 'de', 'demek', 'den', 'derken', \n",
|
| 565 |
+
" 'değil', 'daha', 'dolayı', 'edilir', 'eğer', 'en', 'fakat', \n",
|
| 566 |
+
" 'genellikle', 'gibi', 'hem', 'her', 'herhangi', 'hiç', 'ise', \n",
|
| 567 |
+
" 'işte', 'itibaren', 'iyi', 'kadar', 'karşı', 'ki', 'kime', \n",
|
| 568 |
+
" 'kısaca', 'mu', 'mü', 'nasıl', 'ne', 'neden', 'niye', 'o', \n",
|
| 569 |
+
" 'olasılıkla', 'olabilir', 'oluşur', 'önce', 'şu', 'sadece', \n",
|
| 570 |
+
" 'se', 'şey', 'şimdi', 'tabi', 'tüm', 've', 'ya', 'ya da', \n",
|
| 571 |
+
" 'yanı', 'yani', 'yılında', 'yetenekli', 'yine'\n",
|
| 572 |
+
"]\n",
|
| 573 |
+
"# Get input documents\n",
|
| 574 |
+
"documents, document_count = Database.get_input_documents()\n",
|
| 575 |
+
"\n",
|
| 576 |
+
"# Calculate TF-IDF and get feature names\n",
|
| 577 |
+
"tfidf_matrix, feature_names = Database.calculate_tfidf(documents, turkish_stop_words)\n",
|
| 578 |
+
"\n",
|
| 579 |
+
"# Extract keywords\n",
|
| 580 |
+
"keywords = Database.extract_keywords(tfidf_matrix, feature_names, stop_words=turkish_stop_words)\n",
|
| 581 |
+
"print(keywords)\n",
|
| 582 |
+
"\n",
|
| 583 |
+
"# Identify low TF-IDF words\n",
|
| 584 |
+
"low_tfidf_words = Database.identify_low_tfidf_words(tfidf_matrix, feature_names)\n",
|
| 585 |
+
"print(low_tfidf_words)\n",
|
| 586 |
+
"\n",
|
| 587 |
+
"# Update stop words\n",
|
| 588 |
+
"updated_stop_words = Database.update_stop_words(turkish_stop_words, low_tfidf_words)\n",
|
| 589 |
+
"print(updated_stop_words) "
|
| 590 |
]
|
| 591 |
},
|
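The ValueError in this cell ("empty vocabulary; perhaps the documents only contain stop words") usually means fit_transform received no usable tokens. Two plausible causes here: get_mongodb() now points at the 'combined' database and 'combined_output' collection, while the previous step only wrote a CSV file rather than a Mongo collection, so the cursor may return zero documents; and with a very small corpus, min_df=2 can filter out every remaining term. A defensive sketch, assuming the names used in this cell:

    # Guard before vectorizing (illustrative only)
    documents, document_count = Database.get_input_documents()
    if document_count == 0:
        raise RuntimeError("No documents returned; check the 'combined' / 'combined_output' database and collection names")
    # With few documents, min_df=2 may remove everything; min_df=1 is the safer starting point
    vectorizer = TfidfVectorizer(stop_words=list(turkish_stop_words), max_features=10000, min_df=1)
    tfidf_matrix = vectorizer.fit_transform(documents)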
| 592 |
{
|
| 593 |
+
"cell_type": "code",
|
| 594 |
+
"execution_count": 15,
|
| 595 |
"metadata": {},
|
| 596 |
+
"outputs": [
|
| 597 |
+
{
|
| 598 |
+
"ename": "TypeError",
|
| 599 |
+
"evalue": "unhashable type: 'set'",
|
| 600 |
+
"output_type": "error",
|
| 601 |
+
"traceback": [
|
| 602 |
+
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
|
| 603 |
+
"\u001b[1;31mTypeError\u001b[0m Traceback (most recent call last)",
|
| 604 |
+
"Cell \u001b[1;32mIn[15], line 162\u001b[0m\n\u001b[0;32m 159\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m tfidf_matrix, feature_names,keywords\n\u001b[0;32m 161\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;18m__name__\u001b[39m\u001b[38;5;241m==\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m__main__\u001b[39m\u001b[38;5;124m\"\u001b[39m:\n\u001b[1;32m--> 162\u001b[0m tfidf_matrix, feature_names,keywords\u001b[38;5;241m=\u001b[39m \u001b[43mmain\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 164\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mAnahtar Kelimler:\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m 165\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m doc, words \u001b[38;5;129;01min\u001b[39;00m keywords\u001b[38;5;241m.\u001b[39mitems():\n",
|
| 605 |
+
"Cell \u001b[1;32mIn[15], line 148\u001b[0m, in \u001b[0;36mmain\u001b[1;34m()\u001b[0m\n\u001b[0;32m 146\u001b[0m initial_stop_words \u001b[38;5;241m=\u001b[39m turkish_stop_words\n\u001b[0;32m 147\u001b[0m \u001b[38;5;66;03m# Stop-words listesini iteratif olarak güncelleyin\u001b[39;00m\n\u001b[1;32m--> 148\u001b[0m final_stop_words \u001b[38;5;241m=\u001b[39m \u001b[43miterative_update\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdocuments_list\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43minitial_stop_words\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 149\u001b[0m \u001b[38;5;66;03m#tf-ıdf hesaplama\u001b[39;00m\n\u001b[0;32m 150\u001b[0m tfidf_matrix, feature_names\u001b[38;5;241m=\u001b[39mcalculate_tfidf(documents_list,final_stop_words)\n",
|
| 606 |
+
"Cell \u001b[1;32mIn[15], line 127\u001b[0m, in \u001b[0;36miterative_update\u001b[1;34m(documents, initial_stop_words, iterations)\u001b[0m\n\u001b[0;32m 126\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21miterative_update\u001b[39m(documents, initial_stop_words, iterations\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m5\u001b[39m):\n\u001b[1;32m--> 127\u001b[0m stop_words \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mset\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43minitial_stop_words\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 128\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m _ \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mrange\u001b[39m(iterations):\n\u001b[0;32m 129\u001b[0m tfidf_matrix, feature_names \u001b[38;5;241m=\u001b[39m calculate_tfidf(documents, stop_words)\n",
|
| 607 |
+
"\u001b[1;31mTypeError\u001b[0m: unhashable type: 'set'"
|
| 608 |
+
]
|
| 609 |
+
}
|
| 610 |
+
],
|
| 611 |
"source": [
|
| 612 |
+
"\n",
|
| 613 |
+
"\n",
|
| 614 |
+
"\"\"\"class Tf:\n",
|
| 615 |
+
" @staticmethod\n",
|
| 616 |
+
" def tf(word, blob):\n",
|
| 617 |
+
" return blob.words.count(word) / len(blob.words)\n",
|
| 618 |
+
"\n",
|
| 619 |
+
" @staticmethod\n",
|
| 620 |
+
" def n_containing(word, bloblist):\n",
|
| 621 |
+
" return sum(1 for blob in bloblist if word in blob.words)\n",
|
| 622 |
+
"\n",
|
| 623 |
+
" @staticmethod\n",
|
| 624 |
+
" def idf(word, bloblist):\n",
|
| 625 |
+
" return math.log(len(bloblist) / (1 + Tf.n_containing(word, bloblist)))\n",
|
| 626 |
+
"\n",
|
| 627 |
+
" @staticmethod\n",
|
| 628 |
+
" def tfidf(word, blob, bloblist):\n",
|
| 629 |
+
" return Tf.tf(word, blob) * Tf.idf(word, bloblist)\n",
|
| 630 |
+
"\n",
|
| 631 |
+
" @staticmethod\n",
|
| 632 |
+
" def get_input_documents(limit=1000):\n",
|
| 633 |
+
" return Database.get_input_documents(limit)\"\"\"\n",
|
| 634 |
+
"\n",
|
| 635 |
+
"\n",
|
| 636 |
+
"\n",
|
| 637 |
+
"\n",
|
| 638 |
+
"\n",
|
| 639 |
+
" \"\"\"\n",
|
| 640 |
+
" Her döküman için anahtar kelimeleri seç.\n",
|
| 641 |
+
" :param tfidf_matrix: TF-IDF matris\n",
|
| 642 |
+
" :param feature_names: TF-IDF özellik isimleri\n",
|
| 643 |
+
" :param top_n: Her döküman için seçilecek anahtar kelime sayısı\n",
|
| 644 |
+
" :return: Anahtar kelimeler ve skorlari\n",
|
| 645 |
+
" \"\"\"\n",
|
| 646 |
+
" \n",
|
| 647 |
+
"\n",
|
| 648 |
+
"#--------------------------------------------------------------- burada aldığımız dökümanları listeliyoruz\n",
|
| 649 |
+
"# Dokümanları işleyerek TF-IDF hesaplama\n",
|
| 650 |
+
"#bloblist dökümanların bir listesi\n",
|
| 651 |
+
"\"\"\"bloblist = []\n",
|
| 652 |
+
"for i, blob in enumerate(bloblist):\n",
|
| 653 |
+
" print(\"Top words in document {}\".format(i + 1))\n",
|
| 654 |
+
" scores = {word: Tf.tfidf(word, blob, bloblist) for word in blob.words} #dökümanların içerisinde bulunan kelimeleri alır.\n",
|
| 655 |
+
" sorted_words = sorted(scores.items(), key=lambda x: x[1], reverse=True)\n",
|
| 656 |
+
" for word, score in sorted_words[:3]:\n",
|
| 657 |
+
" print(\"\\tWord: {}, TF-IDF: {}\".format(word, round(score, 5)))\"\"\"\n",
|
| 658 |
+
"\n",
|
| 659 |
+
"\n",
|
| 660 |
+
"# Dökümanları isimlendir\n",
|
| 661 |
+
"#named_documents = {f'döküman {i+1}': doc for i, doc in enumerate(combined_text)}\n",
|
| 662 |
+
"\n",
|
| 663 |
+
"#features olarak top_keywordsleri belirleyerek metnin bu kelimelerin etrafında olması sağlanmalı \n",
|
| 664 |
+
"def calculate_tfidf(documents, stop_words):\n",
|
| 665 |
+
" vectorizer = TfidfVectorizer(stop_words=stop_words, max_features=10000)\n",
|
| 666 |
+
" tfidf_matrix = vectorizer.fit_transform(documents)\n",
|
| 667 |
+
" feature_names = vectorizer.get_feature_names_out()\n",
|
| 668 |
+
" return tfidf_matrix, feature_names\n",
|
| 669 |
+
"\n",
|
| 670 |
+
"#---------------------------------------------------------------------------------\n",
|
| 671 |
+
"#kelimelerin ortalama skorlarını hesaplama \n",
|
| 672 |
+
"def identify_low_tfidf_words(tfidf_matrix, feature_names, threshold=0.001):\n",
|
| 673 |
+
" # TF-IDF skorlarını toplayarak her kelimenin ortalama skorunu hesaplayın\n",
|
| 674 |
+
" avg_scores = np.mean(tfidf_matrix, axis=0).A1\n",
|
| 675 |
+
" low_tfidf_words = [feature_names[i] for i, score in enumerate(avg_scores) if score < threshold]\n",
|
| 676 |
+
" return low_tfidf_words\n",
|
| 677 |
+
"\n",
|
| 678 |
+
"#kelimelerin yeni geliştirilen eşik değere göre güncellenmesi \n",
|
| 679 |
+
"def update_stop_words(existing_stop_words, low_tfidf_words):\n",
|
| 680 |
+
" updated_stop_words = set(existing_stop_words) | set(low_tfidf_words)\n",
|
| 681 |
+
" return list(updated_stop_words)\n",
|
| 682 |
+
"\n",
|
| 683 |
+
"\n",
|
| 684 |
+
"#bu kısım detaylandırılmalı \n",
|
| 685 |
+
"def iterative_update(documents, initial_stop_words, iterations=5):\n",
|
| 686 |
+
" stop_words = set(initial_stop_words)\n",
|
| 687 |
+
" for _ in range(iterations):\n",
|
| 688 |
+
" tfidf_matrix, feature_names = calculate_tfidf(documents, stop_words)\n",
|
| 689 |
+
" low_tfidf_words = identify_low_tfidf_words(tfidf_matrix, feature_names)\n",
|
| 690 |
+
" stop_words = update_stop_words(stop_words, low_tfidf_words)\n",
|
| 691 |
+
" return list(stop_words)\n",
|
| 692 |
+
"\n",
|
| 693 |
+
"\n",
|
| 694 |
+
"\n",
|
| 695 |
+
"def main ():\n",
|
| 696 |
+
"\n",
|
| 697 |
+
" \n",
|
| 698 |
+
"#anlam ilişkisini de kontrol edecek bir yapı oluşpturulacak title ile benzerlik kontrol ederek yüksek benzerlik içeren kelimler sıralnacak .\n",
|
| 699 |
+
"\n",
|
| 700 |
+
"# Dökümanları liste olarak al\n",
|
| 701 |
+
" named_documents, _ = Tf.get_input_documents(limit=1000)\n",
|
| 702 |
+
" documents_list = [doc.get('text', '') if isinstance(doc, dict) else doc for doc in list(named_documents.values())]\n",
|
| 703 |
+
"\n",
|
| 704 |
+
" #başlangıç stop değerleriyle yeni olanları arasında değişim yapma \n",
|
| 705 |
+
" initial_stop_words = turkish_stop_words\n",
|
| 706 |
+
" # Stop-words listesini iteratif olarak güncelleyin\n",
|
| 707 |
+
" final_stop_words = iterative_update(documents_list, initial_stop_words)\n",
|
| 708 |
+
" #tf-ıdf hesaplama\n",
|
| 709 |
+
" tfidf_matrix, feature_names=calculate_tfidf(documents_list,final_stop_words)\n",
|
| 710 |
+
" keywords=extract_keywords(tfidf_matrix,feature_names,top_n=10)\n",
|
| 711 |
+
"\n",
|
| 712 |
+
" \n",
|
| 713 |
+
"\n",
|
| 714 |
+
" print(\"Güncellenmiş Stop-Words Listesi:\", final_stop_words)\n",
|
| 715 |
+
" print(\"TF-IDF Matrix Shape:\", tfidf_matrix.shape)\n",
|
| 716 |
+
" print(\"Feature Names Sample:\", feature_names[:10]) # İlk 10 feature adını gösterir\n",
|
| 717 |
+
"\n",
|
| 718 |
+
" return tfidf_matrix, feature_names,keywords\n",
|
| 719 |
+
"\n",
|
| 720 |
+
"if __name__==\"__main__\":\n",
|
| 721 |
+
" tfidf_matrix, feature_names,keywords= main()\n",
|
| 722 |
+
"\n",
|
| 723 |
+
" print(\"Anahtar Kelimler:\")\n",
|
| 724 |
+
" for doc, words in keywords.items():\n",
|
| 725 |
+
" print(f\"{doc}: {words}\")\n",
|
| 726 |
+
" \n",
|
| 727 |
+
"\n",
|
| 728 |
+
"#---------------------------------------------------------\n",
|
| 729 |
+
" \"\"\"blobs = [tb(doc) for doc in documents_list] # veya 'title' kullanarak başlıkları işleyebilirsiniz\n",
|
| 730 |
+
" all_words = set(word for blob in blobs for word in blob.words)\n",
|
| 731 |
+
"\n",
|
| 732 |
+
" tfidf_scores = {}\n",
|
| 733 |
+
" for word in all_words:\n",
|
| 734 |
+
" tfidf_scores[word] = [Tf.tfidf(word, blob, blobs) for blob in blobs]\n",
|
| 735 |
+
"\n",
|
| 736 |
+
" print(\"TF-IDF Skorları:\")\n",
|
| 737 |
+
" for word, scores in tfidf_scores.items():\n",
|
| 738 |
+
" print(f\"Kelime: {word}, Skorlar: {scores}\")\"\"\"\n"
|
| 739 |
]
|
| 740 |
},
|
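The TypeError in this cell ("unhashable type: 'set'") is raised while building set(initial_stop_words), which suggests a set object ended up as an element of the stop-word list earlier in the session. A small, hedged normalization step before the iterative update can rule that out (normalize_stop_words is a hypothetical helper, not part of the notebook):

    # Hypothetical cleanup: keep plain strings, flattening any nested set/list that slipped in
    def normalize_stop_words(raw):
        cleaned = []
        for item in raw:
            if isinstance(item, str):
                cleaned.append(item)
            else:
                cleaned.extend(str(x) for x in item)
        return cleaned

    initial_stop_words = normalize_stop_words(turkish_stop_words)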
| 741 |
{
|
| 742 |
"cell_type": "code",
|
| 743 |
+
"execution_count": 2,
|
| 744 |
"metadata": {},
|
| 745 |
"outputs": [
|
| 746 |
{
|
| 747 |
+
"ename": "InvalidParameterError",
|
| 748 |
+
"evalue": "The 'stop_words' parameter of TfidfVectorizer must be a str among {'english'}, an instance of 'list' or None. Got {'o', 'den', 'an', 'şey', 'burada', 've', 'ah', 'ise', 'hiç', 'yine', 'biz', 'bu', 'da', 'genellikle', 'yılında', 'belirli', 'se', 'ne', 'kadar', 'neden', 'hem', 'aralar', 'yani', 'daha', 'araba', 'derken', 'dolayı', 'kısaca', 'karşı', 'niye', 'ki', 'bunu', 'buna', 'de', 'herhangi', 'önce', 'tabi', 'kime', 'biten', 'ben', 'ya', 'ya da', 'çünkü', 'mu', 'b', 'demek', 'fakat', 'şimdi', 'birlikte', 'her', 'bağlı', 'nasıl', 'şu', 'sadece', 'tüm', 'aslında', 'edilir', 'ama', 'bence', 'en', 'işte', 'gibi', 'ancak', 'birkaç', 'itibaren', 'mü', 'olabilir', 'bazı', 'oluşur', 'başlayan', 'yanı', 'olasılıkla', 'iyi', 'değil', 'eğer', 'yetenekli'} instead.",
|
| 749 |
+
"output_type": "error",
|
| 750 |
+
"traceback": [
|
| 751 |
+
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
|
| 752 |
+
"\u001b[1;31mInvalidParameterError\u001b[0m Traceback (most recent call last)",
|
| 753 |
+
"Cell \u001b[1;32mIn[2], line 155\u001b[0m\n\u001b[0;32m 152\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m tfidf_matrix, feature_names,documents_list \n\u001b[0;32m 154\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;18m__name__\u001b[39m\u001b[38;5;241m==\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m__main__\u001b[39m\u001b[38;5;124m\"\u001b[39m:\n\u001b[1;32m--> 155\u001b[0m tfidf_matrix, feature_names,documents_list\u001b[38;5;241m=\u001b[39m \u001b[43mmain\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 158\u001b[0m \u001b[38;5;66;03m# Sonuçları yazdır\u001b[39;00m\n\u001b[0;32m 159\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mİsimlendirilmiş Dökümanlar:\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n",
|
| 754 |
+
"Cell \u001b[1;32mIn[2], line 142\u001b[0m, in \u001b[0;36mmain\u001b[1;34m()\u001b[0m\n\u001b[0;32m 140\u001b[0m initial_stop_words \u001b[38;5;241m=\u001b[39m turkish_stop_words\n\u001b[0;32m 141\u001b[0m \u001b[38;5;66;03m# Stop-words listesini iteratif olarak güncelleyin\u001b[39;00m\n\u001b[1;32m--> 142\u001b[0m final_stop_words \u001b[38;5;241m=\u001b[39m \u001b[43miterative_update\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdocuments_list\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43minitial_stop_words\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 143\u001b[0m \u001b[38;5;66;03m#tf-ıdf hesaplama\u001b[39;00m\n\u001b[0;32m 144\u001b[0m tfidf_matrix, feature_names\u001b[38;5;241m=\u001b[39mcalculate_tfidf(documents_list,final_stop_words)\n",
|
| 755 |
+
"Cell \u001b[1;32mIn[2], line 124\u001b[0m, in \u001b[0;36miterative_update\u001b[1;34m(documents, initial_stop_words, iterations)\u001b[0m\n\u001b[0;32m 122\u001b[0m stop_words \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mset\u001b[39m(initial_stop_words)\n\u001b[0;32m 123\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m _ \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mrange\u001b[39m(iterations):\n\u001b[1;32m--> 124\u001b[0m tfidf_matrix, feature_names \u001b[38;5;241m=\u001b[39m \u001b[43mcalculate_tfidf\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdocuments\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mstop_words\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 125\u001b[0m low_tfidf_words \u001b[38;5;241m=\u001b[39m identify_low_tfidf_words(tfidf_matrix, feature_names)\n\u001b[0;32m 126\u001b[0m stop_words \u001b[38;5;241m=\u001b[39m update_stop_words(stop_words, low_tfidf_words)\n",
|
| 756 |
+
"Cell \u001b[1;32mIn[2], line 103\u001b[0m, in \u001b[0;36mcalculate_tfidf\u001b[1;34m(documents, stop_words)\u001b[0m\n\u001b[0;32m 101\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mcalculate_tfidf\u001b[39m(documents, stop_words):\n\u001b[0;32m 102\u001b[0m vectorizer \u001b[38;5;241m=\u001b[39m TfidfVectorizer(stop_words\u001b[38;5;241m=\u001b[39mstop_words, max_features\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m10000\u001b[39m)\n\u001b[1;32m--> 103\u001b[0m tfidf_matrix \u001b[38;5;241m=\u001b[39m \u001b[43mvectorizer\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfit_transform\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdocuments\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 104\u001b[0m feature_names \u001b[38;5;241m=\u001b[39m vectorizer\u001b[38;5;241m.\u001b[39mget_feature_names_out()\n\u001b[0;32m 105\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m tfidf_matrix, feature_names\n",
|
| 757 |
+
"File \u001b[1;32mc:\\gitProjects\\yeni\\.venv\\lib\\site-packages\\sklearn\\feature_extraction\\text.py:2091\u001b[0m, in \u001b[0;36mTfidfVectorizer.fit_transform\u001b[1;34m(self, raw_documents, y)\u001b[0m\n\u001b[0;32m 2084\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_check_params()\n\u001b[0;32m 2085\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_tfidf \u001b[38;5;241m=\u001b[39m TfidfTransformer(\n\u001b[0;32m 2086\u001b[0m norm\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mnorm,\n\u001b[0;32m 2087\u001b[0m use_idf\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39muse_idf,\n\u001b[0;32m 2088\u001b[0m smooth_idf\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39msmooth_idf,\n\u001b[0;32m 2089\u001b[0m sublinear_tf\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39msublinear_tf,\n\u001b[0;32m 2090\u001b[0m )\n\u001b[1;32m-> 2091\u001b[0m X \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43msuper\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfit_transform\u001b[49m\u001b[43m(\u001b[49m\u001b[43mraw_documents\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 2092\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_tfidf\u001b[38;5;241m.\u001b[39mfit(X)\n\u001b[0;32m 2093\u001b[0m \u001b[38;5;66;03m# X is already a transformed view of raw_documents so\u001b[39;00m\n\u001b[0;32m 2094\u001b[0m \u001b[38;5;66;03m# we set copy to False\u001b[39;00m\n",
|
| 758 |
+
"File \u001b[1;32mc:\\gitProjects\\yeni\\.venv\\lib\\site-packages\\sklearn\\base.py:1466\u001b[0m, in \u001b[0;36m_fit_context.<locals>.decorator.<locals>.wrapper\u001b[1;34m(estimator, *args, **kwargs)\u001b[0m\n\u001b[0;32m 1461\u001b[0m partial_fit_and_fitted \u001b[38;5;241m=\u001b[39m (\n\u001b[0;32m 1462\u001b[0m fit_method\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__name__\u001b[39m \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mpartial_fit\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;129;01mand\u001b[39;00m _is_fitted(estimator)\n\u001b[0;32m 1463\u001b[0m )\n\u001b[0;32m 1465\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m global_skip_validation \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m partial_fit_and_fitted:\n\u001b[1;32m-> 1466\u001b[0m \u001b[43mestimator\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_validate_params\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 1468\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m config_context(\n\u001b[0;32m 1469\u001b[0m skip_parameter_validation\u001b[38;5;241m=\u001b[39m(\n\u001b[0;32m 1470\u001b[0m prefer_skip_nested_validation \u001b[38;5;129;01mor\u001b[39;00m global_skip_validation\n\u001b[0;32m 1471\u001b[0m )\n\u001b[0;32m 1472\u001b[0m ):\n\u001b[0;32m 1473\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m fit_method(estimator, \u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n",
|
| 759 |
+
"File \u001b[1;32mc:\\gitProjects\\yeni\\.venv\\lib\\site-packages\\sklearn\\base.py:666\u001b[0m, in \u001b[0;36mBaseEstimator._validate_params\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 658\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m_validate_params\u001b[39m(\u001b[38;5;28mself\u001b[39m):\n\u001b[0;32m 659\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"Validate types and values of constructor parameters\u001b[39;00m\n\u001b[0;32m 660\u001b[0m \n\u001b[0;32m 661\u001b[0m \u001b[38;5;124;03m The expected type and values must be defined in the `_parameter_constraints`\u001b[39;00m\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 664\u001b[0m \u001b[38;5;124;03m accepted constraints.\u001b[39;00m\n\u001b[0;32m 665\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[1;32m--> 666\u001b[0m \u001b[43mvalidate_parameter_constraints\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m 667\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_parameter_constraints\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 668\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget_params\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdeep\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 669\u001b[0m \u001b[43m \u001b[49m\u001b[43mcaller_name\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[38;5;18;43m__class__\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[38;5;18;43m__name__\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[0;32m 670\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n",
|
| 760 |
+
"File \u001b[1;32mc:\\gitProjects\\yeni\\.venv\\lib\\site-packages\\sklearn\\utils\\_param_validation.py:95\u001b[0m, in \u001b[0;36mvalidate_parameter_constraints\u001b[1;34m(parameter_constraints, params, caller_name)\u001b[0m\n\u001b[0;32m 89\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m 90\u001b[0m constraints_str \u001b[38;5;241m=\u001b[39m (\n\u001b[0;32m 91\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m, \u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;241m.\u001b[39mjoin([\u001b[38;5;28mstr\u001b[39m(c)\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mfor\u001b[39;00m\u001b[38;5;250m \u001b[39mc\u001b[38;5;250m \u001b[39m\u001b[38;5;129;01min\u001b[39;00m\u001b[38;5;250m \u001b[39mconstraints[:\u001b[38;5;241m-\u001b[39m\u001b[38;5;241m1\u001b[39m]])\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m or\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m 92\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mconstraints[\u001b[38;5;241m-\u001b[39m\u001b[38;5;241m1\u001b[39m]\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m 93\u001b[0m )\n\u001b[1;32m---> 95\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m InvalidParameterError(\n\u001b[0;32m 96\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mThe \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mparam_name\u001b[38;5;132;01m!r}\u001b[39;00m\u001b[38;5;124m parameter of \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mcaller_name\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m must be\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m 97\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mconstraints_str\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m. Got \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mparam_val\u001b[38;5;132;01m!r}\u001b[39;00m\u001b[38;5;124m instead.\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m 98\u001b[0m )\n",
|
| 761 |
+
"\u001b[1;31mInvalidParameterError\u001b[0m: The 'stop_words' parameter of TfidfVectorizer must be a str among {'english'}, an instance of 'list' or None. Got {'o', 'den', 'an', 'şey', 'burada', 've', 'ah', 'ise', 'hiç', 'yine', 'biz', 'bu', 'da', 'genellikle', 'yılında', 'belirli', 'se', 'ne', 'kadar', 'neden', 'hem', 'aralar', 'yani', 'daha', 'araba', 'derken', 'dolayı', 'kısaca', 'karşı', 'niye', 'ki', 'bunu', 'buna', 'de', 'herhangi', 'önce', 'tabi', 'kime', 'biten', 'ben', 'ya', 'ya da', 'çünkü', 'mu', 'b', 'demek', 'fakat', 'şimdi', 'birlikte', 'her', 'bağlı', 'nasıl', 'şu', 'sadece', 'tüm', 'aslında', 'edilir', 'ama', 'bence', 'en', 'işte', 'gibi', 'ancak', 'birkaç', 'itibaren', 'mü', 'olabilir', 'bazı', 'oluşur', 'başlayan', 'yanı', 'olasılıkla', 'iyi', 'değil', 'eğer', 'yetenekli'} instead."
|
| 762 |
+
]
|
| 763 |
}
|
| 764 |
],
|
| 765 |
"source": [
|
|
|
|
| 776 |
" def get_mongodb():\n",
|
| 777 |
" return 'mongodb://localhost:27017/', 'EgitimDatabase', 'train'\n",
|
| 778 |
"\n",
|
| 779 |
+
"#--------------------------------------------------------------------------\n",
|
| 780 |
+
"#combined_text eklenmeli \n",
|
| 781 |
" @staticmethod\n",
|
| 782 |
" def get_input_documents(limit=3):\n",
|
| 783 |
" mongo_url, db_name, collection_name = Database.get_mongodb()\n",
|
|
|
|
| 785 |
" db = client[db_name]\n",
|
| 786 |
" collection = db[collection_name]\n",
|
| 787 |
" cursor = collection.find().limit(limit)\n",
|
| 788 |
+
" combined_text = [doc for doc in cursor]\n",
|
| 789 |
+
" document_count = len(combined_text)\n",
|
| 790 |
" \n",
|
| 791 |
" # Dökümanları isimlendir\n",
|
| 792 |
+
" named_documents = {f'döküman {i+1}': doc for i, doc in enumerate(combined_text)}\n",
|
| 793 |
" \n",
|
| 794 |
" return named_documents, document_count\n",
|
| 795 |
"\n",
|
|
|
|
| 816 |
" return Database.get_input_documents(limit)\n",
|
| 817 |
"\n",
|
| 818 |
"# Kullanım örneği\n",
|
| 819 |
+
"named_documents, document_count = Tf.get_input_documents(limit=1000)\n",
|
| 820 |
"\n",
|
| 821 |
"#tf-ıdf ile döküman içerisinden kelime seçme \n",
|
| 822 |
"\n",
|
|
|
|
| 850 |
" for word, score in sorted_words[:3]:\n",
|
| 851 |
" print(\"\\tWord: {}, TF-IDF: {}\".format(word, round(score, 5)))\n",
|
| 852 |
"\n",
|
| 853 |
+
"\n",
|
| 854 |
+
"#buraya eşik değer belirlenmeli\n",
|
| 855 |
"turkish_stop_words = [\n",
|
| 856 |
" 'ah', 'ama', 'an', 'ancak', 'araba', 'aralar', 'aslında', \n",
|
| 857 |
+
" 'b', 'başlayan', 'bağlı', 'bazı', 'belirli', 'ben', 'bence', \n",
|
| 858 |
+
" 'birkaç', 'birlikte', 'bunu', 'burada', 'biten', 'biz', \n",
|
| 859 |
+
" 'bu', 'buna', 'çünkü', 'da', 'de', 'demek', 'den', 'derken', \n",
|
| 860 |
+
" 'değil', 'daha', 'dolayı', 'edilir', 'eğer', 'en', 'fakat', \n",
|
| 861 |
+
" 'genellikle', 'gibi', 'hem', 'her', 'herhangi', 'hiç', 'ise', \n",
|
| 862 |
+
" 'işte', 'itibaren', 'iyi', 'kadar', 'karşı', 'ki', 'kime', \n",
|
| 863 |
+
" 'kısaca', 'mu', 'mü', 'nasıl', 'ne', 'neden', 'niye', 'o', \n",
|
| 864 |
+
" 'olasılıkla', 'olabilir', 'oluşur', 'önce', 'şu', 'sadece', \n",
|
| 865 |
+
" 'se', 'şey', 'şimdi', 'tabi', 'tüm', 've', 'ya', 'ya da', \n",
|
| 866 |
+
" 'yanı', 'yani', 'yılında', 'yetenekli', 'yine'\n",
|
| 867 |
"]\n",
|
| 868 |
"\n",
|
| 869 |
+
"#features olarak top_keywordsleri belirleyerek metnin bu kelimelerin etrafında olması sağlanmalı \n",
|
| 870 |
+
"def calculate_tfidf(combined_text, stop_words):\n",
|
| 871 |
" vectorizer = TfidfVectorizer(stop_words=stop_words, max_features=10000)\n",
|
| 872 |
+
" tfidf_matrix = vectorizer.fit_transform(combined_text)\n",
|
| 873 |
" feature_names = vectorizer.get_feature_names_out()\n",
|
| 874 |
" return tfidf_matrix, feature_names\n",
|
| 875 |
"\n",
|
| 876 |
+
"#---------------------------------------------------------------------------------\n",
|
| 877 |
"#kelimelerin ortalama skorlarını hesaplama \n",
|
| 878 |
"def identify_low_tfidf_words(tfidf_matrix, feature_names, threshold=0.001):\n",
|
| 879 |
" # TF-IDF skorlarını toplayarak her kelimenin ortalama skorunu hesaplayın\n",
|
|
|
|
| 881 |
" low_tfidf_words = [feature_names[i] for i, score in enumerate(avg_scores) if score < threshold]\n",
|
| 882 |
" return low_tfidf_words\n",
|
| 883 |
"\n",
|
| 884 |
+
"#kelimelerin yeni geliştirilen eşik değere göre güncellenmesi \n",
|
| 885 |
"def update_stop_words(existing_stop_words, low_tfidf_words):\n",
|
| 886 |
" updated_stop_words = set(existing_stop_words) | set(low_tfidf_words)\n",
|
| 887 |
" return list(updated_stop_words)\n",
|
| 888 |
"\n",
|
| 889 |
"\n",
|
| 890 |
+
"#bu kısım detaylandırılmalı \n",
|
| 891 |
+
"def iterative_update(combined_text, initial_stop_words, iterations=5):\n",
|
| 892 |
" stop_words = set(initial_stop_words)\n",
|
| 893 |
" for _ in range(iterations):\n",
|
| 894 |
+
" tfidf_matrix, feature_names = calculate_tfidf(combined_text, stop_words)\n",
|
| 895 |
" low_tfidf_words = identify_low_tfidf_words(tfidf_matrix, feature_names)\n",
|
| 896 |
" stop_words = update_stop_words(stop_words, low_tfidf_words)\n",
|
| 897 |
" return list(stop_words)\n",
|
| 898 |
+
"\n",
|
| 899 |
"\n",
|
| 900 |
"\n",
|
| 901 |
"def main ():\n",
|
| 902 |
"\n",
|
| 903 |
+
" \n",
|
| 904 |
"#anlam ilişkisini de kontrol edecek bir yapı oluşpturulacak title ile benzerlik kontrol ederek yüksek benzerlik içeren kelimler sıralnacak .\n",
|
| 905 |
"\n",
|
| 906 |
"# Dökümanları liste olarak al\n",
|
| 907 |
" documents_list = [doc.get('text', '') if isinstance(doc, dict) else doc for doc in list(named_documents.values())]\n",
|
| 908 |
"\n",
|
| 909 |
+
" #başlangıç stop değerleriyle yeni olanları arasında değişim yapma \n",
|
| 910 |
+
" initial_stop_words = turkish_stop_words\n",
|
| 911 |
+
" # Stop-words listesini iteratif olarak güncelleyin\n",
|
| 912 |
+
" final_stop_words = iterative_update(documents_list, initial_stop_words)\n",
|
| 913 |
" #tf-ıdf hesaplama\n",
|
| 914 |
+
" tfidf_matrix, feature_names=calculate_tfidf(documents_list,final_stop_words)\n",
|
| 915 |
"\n",
|
| 916 |
+
" \n",
|
|
|
|
| 917 |
"\n",
|
| 918 |
+
" print(\"Güncellenmiş Stop-Words Listesi:\", final_stop_words)\n",
|
| 919 |
+
" print(\"TF-IDF Matrix Shape:\", tfidf_matrix.shape)\n",
|
| 920 |
+
" print(\"Feature Names Sample:\", feature_names[:10]) # İlk 10 feature adını gösterir\n",
|
| 921 |
"\n",
|
| 922 |
+
" return tfidf_matrix, feature_names,documents_list \n",
|
|
|
|
| 923 |
"\n",
|
| 924 |
+
"if __name__==\"__main__\":\n",
|
| 925 |
+
" tfidf_matrix, feature_names,documents_list= main()\n",
|
| 926 |
"\n",
|
| 927 |
"\n",
|
| 928 |
"# Sonuçları yazdır\n",
|
| 929 |
+
"print(\"İsimlendirilmiş Dökümanlar:\")\n",
|
| 930 |
+
"for name, doc in named_documents.items():\n",
|
| 931 |
+
" print(f\"{name}: {doc}\")\n",
|
| 932 |
"\n",
|
| 933 |
" print(\"\\nDökümanlar Listesi:\")\n",
|
| 934 |
" print(documents_list)\n",
|
|
|
|
| 1010 |
},
|
| 1011 |
{
|
| 1012 |
"cell_type": "code",
|
| 1013 |
+
"execution_count": 1,
|
| 1014 |
"metadata": {},
|
| 1015 |
+
"outputs": [
|
| 1016 |
+
{
|
| 1017 |
+
"ename": "NameError",
|
| 1018 |
+
"evalue": "name 'TfidfVectorizer' is not defined",
|
| 1019 |
+
"output_type": "error",
|
| 1020 |
+
"traceback": [
|
| 1021 |
+
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
|
| 1022 |
+
"\u001b[1;31mNameError\u001b[0m Traceback (most recent call last)",
|
| 1023 |
+
"Cell \u001b[1;32mIn[1], line 41\u001b[0m\n\u001b[0;32m 31\u001b[0m turkish_stop_words \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mset\u001b[39m([\n\u001b[0;32m 32\u001b[0m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124ma\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mabide\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mabi\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mabla\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mad\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124madım\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mah\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mama\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124man\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mancak\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124maraba\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124maralar\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124maslında\u001b[39m\u001b[38;5;124m'\u001b[39m, \n\u001b[0;32m 33\u001b[0m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124maşşağı\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124maz\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mb\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mbazı\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mbelirli\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mben\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mbence\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mbunu\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mburada\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mbiz\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mbu\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mbuna\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mçünkü\u001b[39m\u001b[38;5;124m'\u001b[39m, \n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 37\u001b[0m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mönce\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mşu\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124msadece\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124msana\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mse\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mşey\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mşimdi\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mtabi\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mtüm\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mve\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mya\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mya da\u001b[39m\u001b[38;5;124m'\u001b[39m, 
\u001b[38;5;124m'\u001b[39m\u001b[38;5;124myani\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124myine\u001b[39m\u001b[38;5;124m'\u001b[39m\n\u001b[0;32m 38\u001b[0m ])\n\u001b[0;32m 40\u001b[0m \u001b[38;5;66;03m# TF-IDF hesaplayıcı oluşturun ve Türkçe durak kelimelerini dahil edin\u001b[39;00m\n\u001b[1;32m---> 41\u001b[0m vectorizer \u001b[38;5;241m=\u001b[39m \u001b[43mTfidfVectorizer\u001b[49m(stop_words\u001b[38;5;241m=\u001b[39mturkish_stop_words)\n\u001b[0;32m 44\u001b[0m \u001b[38;5;124;03m\"\"\"IDF, derlemedeki belge sayısının,\u001b[39;00m\n\u001b[0;32m 45\u001b[0m \u001b[38;5;124;03mincelenen anahtar kelimeyi içeren topluluktaki belge sayısına \u001b[39;00m\n\u001b[0;32m 46\u001b[0m \u001b[38;5;124;03mbölünmesiyle elde edilen algoritmadır. \u001b[39;00m\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 49\u001b[0m \u001b[38;5;124;03mkülliyat yani incelenen tüm belgelerin adedi 10 ise ve test edilen anahtar kelime,\u001b[39;00m\n\u001b[0;32m 50\u001b[0m \u001b[38;5;124;03mkülliyattaki üç belgede görünüyorsa, bu durumda IDF değeri 0.52’dir (log (10/3)).\"\"\"\u001b[39;00m\n\u001b[0;32m 51\u001b[0m \u001b[38;5;66;03m#TF-IDF puanı; Naive Bayes ve Destek Vektör Makineleri gibi algoritmalara aktarılabilir. Böylece kelime sayısı gibi daha temel yöntemlerin sonuçları büyük ölçüde iyileştirilebilir.\u001b[39;00m\n\u001b[0;32m 52\u001b[0m \u001b[38;5;66;03m#IDF = log ( Dokuman Sayısı / Terimin Geçtiği Dokuman Sayısı )\u001b[39;00m\n\u001b[0;32m 53\u001b[0m \u001b[38;5;66;03m#dokuman sayısılarını almakla başlayacağız.\u001b[39;00m\n\u001b[0;32m 54\u001b[0m \u001b[38;5;66;03m# : titlelerın sayısı / terimler ise \u001b[39;00m\n",
|
| 1024 |
+
"\u001b[1;31mNameError\u001b[0m: name 'TfidfVectorizer' is not defined"
|
| 1025 |
+
]
|
| 1026 |
+
}
|
| 1027 |
+
],
|
| 1028 |
"source": [
|
| 1029 |
"\n",
|
| 1030 |
"#---------------------------------------------------------------------------------------------------------------------------------\n",
|