Fix Dockerfile build issue

2025-03-18 16:41:12 +09:00
parent 6814230bfb
commit 9323aa254a
228 changed files with 467 additions and 3488 deletions
--- a/projects/example_01/benchmark/data/corpus.parquet
+++ b/projects/example_01/benchmark/data/corpus.parquet
--- a/projects/example_01/benchmark/data/qa.parquet
+++ b/projects/example_01/benchmark/data/qa.parquet
--- a/projects/src/check_corpus_ids.py
+++ b/projects/src/check_corpus_ids.py
@@ -1,12 +0,0 @@
-import pandas as pd
-
-# corpus.parquet 파일 로드
-corpus_path = "./original/corpus.parquet"
-corpus_data = pd.read_parquet(corpus_path)
-
-# 특정 문서 ID가 존재하는지 확인
-doc_id = "bac7dea6-9477-4290-b57b-861548f7020d"
-print(doc_id in corpus_data['doc_id'].values)  # True면 존재, False면 없음
-
-# corpus_data의 첫 5개 데이터 확인
-print(corpus_data.head())
--- a/projects/src/check_vectordb_corpus.py
+++ b/projects/src/check_vectordb_corpus.py
@@ -1,14 +0,0 @@
-from chromadb import PersistentClient
-
-# ChromaDB 클라이언트 연결
-client = PersistentClient(path="./report_01/chroma")
-
-# 컬렉션 목록 확인
-print(client.list_collections())
-
-# 'document_collection' 컬렉션에서 데이터 조회
-collection = client.get_collection("document_collection")
-
-# 저장된 모든 문서 ID 조회
-stored_docs = collection.get(include=["ids"])
-print("Stored Document IDs:", stored_docs)
--- a/projects/src/check_vectordb_ingestion.py
+++ b/projects/src/check_vectordb_ingestion.py
@@ -1,11 +0,0 @@
-import chromadb
-
-# ChromaDB 연결
-client = chromadb.PersistentClient(path="./report_01/chroma")
-collection = client.get_collection("document_collection")
-
-# 저장된 문서 개수 확인
-print("Stored Document Count:", len(collection.get(include=['ids'])['ids']))
-
-# 일부 문서 ID 확인
-print("Example Document IDs:", collection.get(include=['ids'], limit=5)['ids'])
--- a/projects/src/convert_parquet_to_json.py
+++ b/projects/src/convert_parquet_to_json.py
@@ -1,21 +0,0 @@
-import os
-import pandas as pd
-
-SOURCE_DIR = "/usr/src/app/projects/daesan-dangjin_01"
-TARGET_DIR = "/usr/src/app/projects/daesan-dangjin_01/json"
-os.makedirs(TARGET_DIR, exist_ok=True)
-
-parquet_files = [f for f in os.listdir(SOURCE_DIR) if f.endswith(".parquet")]
-
-
-for file in parquet_files:
-    parquet_path = os.path.join(SOURCE_DIR, file)
-    json_filename = os.path.splitext(file)[0] + ".json"
-    json_path = os.path.join(TARGET_DIR, json_filename)
-    
-    df = pd.read_parquet(parquet_path, engine="pyarrow")
-    df.to_json(json_path, orient="records", force_ascii=False, indent=2)
-
-    print(f"✅ 변환 완료: {json_path}")
-
-print(f"📁 모든 Parquet 파일이 JSON으로 변환되어 {TARGET_DIR}에 저장되었습니다.")