Initial commit

2025-03-14 17:33:18 +09:00
parent ba9c1a4a5f
commit 6814230bfb
61 changed files with 2087124 additions and 4 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -162,7 +162,7 @@ cython_debug/
 .idea/
 .DS_Store
 pytest.ini
-projects
+# projects
 test_projects

 # Visual Studio Code
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,3 +0,0 @@
-[submodule "autorag-frontend"]
-	path = autorag-frontend
-	url = https://github.com/Auto-RAG/autorag-frontend.git
--- a/projects/example_01/benchmark_ck0/0/config.yaml
+++ b/projects/example_01/benchmark_ck0/0/config.yaml
@@ -0,0 +1,92 @@
+vectordb:
+  - name: chroma_dragonkue2
+    db_type: chroma
+    client_type: persistent
+    embedding_model: huggingface_drangonku-v2-ko
+    collection_name: huggingface_drangonku-v2-ko
+    path: ${PROJECT_DIR}/resources/chroma
+
+node_lines:
+- node_line_name: retrieve_node_line  # Arbitrary node line name
+  nodes:
+    - node_type: retrieval
+      strategy:
+        metrics: [ retrieval_f1, retrieval_recall, retrieval_precision,
+                   retrieval_ndcg, retrieval_map, retrieval_mrr ]
+        speed_threshold: 10
+      top_k: 10
+      modules:
+        - module_type: bm25
+          bm25_tokenizer: [ ko_kiwi, ko_okt ]
+        - module_type: vectordb
+          vectordb: chroma_dragonkue2 # chromadb
+        - module_type: hybrid_cc
+          normalize_method: [ mm, tmm, z, dbsf ]
+          target_modules: ('bm25', 'vectordb')
+          weight_range: (0.6, 0.4)
+          test_weight_size: 101
+    - node_type: passage_reranker # re-ranker
+      strategy:
+        metrics:
+        - retrieval_recall
+        - retrieval_precision
+        - retrieval_map
+      modules:
+      - module_type: dragonkue2
+        top_k: 5
+
+- node_line_name: post_retrieve_node_line # generation nodes
+  nodes:
+  - node_type: prompt_maker
+    strategy:
+      metrics:
+      - metric_name: bleu
+      - metric_name: meteor
+      - metric_name: rouge
+      - metric_name: sem_score
+        embedding_model: huggingface_drangonku-v2-ko  # raise ValueError("Only one embedding model is supported")
+        lang: ko
+      generator_modules:
+      - module_type: llama_index_llm
+        llm: ollama
+        model: [ gemma3:12b, phi4, deepseek-r1:14b, aya-expanse:8b ]
+        request_timeout: 3000.0
+    modules:
+      - module_type: fstring
+        prompt:
+        - |
+          ### Task:
+          Respond to the user query using the provided context.
+
+          ### Guidelines:
+          - If you don't know the answer, clearly state that.
+          - If uncertain, ask the user for clarification.
+          - Respond in the same language as the user's query.
+          - If the context is unreadable or of poor quality, inform the user and provide the best possible answer.
+          - If the answer isn't present in the context but you possess the knowledge, explain this to the user and provide the answer using your own understanding.
+          - Do not use XML tags in your response.
+
+          ### Output:
+          Provide a clear and direct response to the user's query.
+
+          <context>
+          {retrieved_contents}
+          </context>
+
+          <user_query>
+          {query}
+          </user_query>
+  - node_type: generator # Gen-LLM
+    strategy: 
+      metrics:
+      - metric_name: bleu
+      - metric_name: meteor
+      - metric_name: rouge
+      - metric_name: sem_score
+    modules:
+    - module_type: llama_index_llm
+      llm: ollama
+      model: gemma3:12b  # phi4, deepseek-r1:14b, aya-expanse:8b
+      temperature: 0.0 
+      request_timeout: 30000.0
+      batch: 4
--- a/projects/example_01/benchmark_ck0/1/config.yaml
+++ b/projects/example_01/benchmark_ck0/1/config.yaml
@@ -0,0 +1,92 @@
+vectordb:
+  - name: chroma_dragonkue2
+    db_type: chroma
+    client_type: persistent
+    embedding_model: huggingface_drangonku-v2-ko
+    collection_name: huggingface_drangonku-v2-ko
+    path: ${PROJECT_DIR}/resources/chroma
+
+node_lines:
+- node_line_name: retrieve_node_line  # Arbitrary node line name
+  nodes:
+    - node_type: retrieval
+      strategy:
+        metrics: [ retrieval_f1, retrieval_recall, retrieval_precision,
+                   retrieval_ndcg, retrieval_map, retrieval_mrr ]
+        speed_threshold: 10
+      top_k: 10
+      modules:
+        - module_type: bm25
+          bm25_tokenizer: [ ko_kiwi ] # ko_kiwi, ko_okt
+        - module_type: vectordb
+          vectordb: chroma_dragonkue2 # chromadb
+        - module_type: hybrid_cc
+          normalize_method: [ mm, tmm, z, dbsf ]
+          target_modules: ('bm25', 'vectordb')
+          weight_range: (0.6, 0.4)
+          test_weight_size: 101
+    - node_type: passage_reranker # re-ranker
+      strategy:
+        metrics:
+        - retrieval_recall
+        - retrieval_precision
+        - retrieval_map
+      modules:
+      - module_type: dragonkue2
+        top_k: 5
+
+- node_line_name: post_retrieve_node_line # generation nodes
+  nodes:
+  - node_type: prompt_maker
+    strategy:
+      metrics:
+      - metric_name: bleu
+      - metric_name: meteor
+      - metric_name: rouge
+      - metric_name: sem_score
+        embedding_model: huggingface_drangonku-v2-ko  # raise ValueError("Only one embedding model is supported")
+        lang: ko
+      generator_modules:
+      - module_type: llama_index_llm
+        llm: ollama
+        model: gemma3:12b
+        request_timeout: 3000.0
+    modules:
+      - module_type: fstring
+        prompt:
+        - |
+          ### 작업:  
+          지침에 따라 제공된 컨텍스트를 활용하여 사용자 질문에 답변하세요.  
+
+          ### 지침:  
+          - 답을 모를 경우, 모른다고 명확히 말하세요.  
+          - 확신이 없다면, 사용자에게 추가 설명을 요청하세요.  
+          - 사용자의 질문과 동일한 언어로 답변하세요.  
+          - 컨텍스트가 읽기 어렵거나 품질이 낮을 경우, 이를 사용자에게 알리고 최선의 답변을 제공하세요.  
+          - 컨텍스트에 답이 없지만 알고 있는 내용이라면, 이를 사용자에게 설명하고 자신의 지식을 바탕으로 답변하세요.  
+          - XML 태그를 사용하지 마세요.  
+
+          ### 출력:  
+          사용자의 질문에 대해 명확하고 직접적인 답변을 제공하세요.
+
+          <context>
+          {retrieved_contents}
+          </context>
+
+          <user_query>
+          {query}
+          </user_query>
+  - node_type: generator # Gen-LLM
+    strategy: 
+      metrics:
+      - metric_name: bleu
+      - metric_name: meteor
+      - metric_name: rouge
+      - metric_name: sem_score
+    modules:
+    - module_type: llama_index_llm
+      llm: ollama
+      model: gemma3:12b  # phi4, deepseek-r1:14b, aya-expanse:8b
+      temperature: 0.0 
+      request_timeout: 300.0
+      batch: 8
--- a/projects/example_01/benchmark_ck0/1/post_retrieve_node_line/generator/0.parquet
+++ b/projects/example_01/benchmark_ck0/1/post_retrieve_node_line/generator/0.parquet
--- a/projects/example_01/benchmark_ck0/1/post_retrieve_node_line/generator/best_0.parquet
+++ b/projects/example_01/benchmark_ck0/1/post_retrieve_node_line/generator/best_0.parquet
--- a/projects/example_01/benchmark_ck0/1/post_retrieve_node_line/generator/summary.csv
+++ b/projects/example_01/benchmark_ck0/1/post_retrieve_node_line/generator/summary.csv
@@ -0,0 +1,2 @@
+filename,module_name,module_params,execution_time,average_output_token,bleu,meteor,rouge,sem_score,is_best
+0.parquet,LlamaIndexLLM,"{'llm': 'ollama', 'model': 'gemma3:12b', 'temperature': 0.0, 'request_timeout': 300.0, 'batch': 8}",0.8519447922706604,259.05,14.57290077698799,0.47984407229799053,0.4400396825396825,0.8177114641079747,True
--- a/projects/example_01/benchmark_ck0/1/post_retrieve_node_line/prompt_maker/0.parquet
+++ b/projects/example_01/benchmark_ck0/1/post_retrieve_node_line/prompt_maker/0.parquet
--- a/projects/example_01/benchmark_ck0/1/post_retrieve_node_line/prompt_maker/best_0.parquet
+++ b/projects/example_01/benchmark_ck0/1/post_retrieve_node_line/prompt_maker/best_0.parquet
--- a/projects/example_01/benchmark_ck0/1/post_retrieve_node_line/prompt_maker/summary.csv
+++ b/projects/example_01/benchmark_ck0/1/post_retrieve_node_line/prompt_maker/summary.csv
@@ -0,0 +1,2 @@
+filename,module_name,module_params,execution_time,average_prompt_token,is_best
+0.parquet,Fstring,"{'prompt': '### 작업:  \n지침에 따라 제공된 컨텍스트를 활용하여 사용자 질문에 답변하세요.  \n\n### 지침:  \n- 답을 모를 경우, 모른다고 명확히 말하세요.  \n- 확신이 없다면, 사용자에게 추가 설명을 요청하세요.  \n- 사용자의 질문과 동일한 언어로 답변하세요.  \n- 컨텍스트가 읽기 어렵거나 품질이 낮을 경우, 이를 사용자에게 알리고 최선의 답변을 제공하세요.  \n- 컨텍스트에 답이 없지만 알고 있는 내용이라면, 이를 사용자에게 설명하고 자신의 지식을 바탕으로 답변하세요.  \n- XML 태그를 사용하지 마세요.  \n\n### 출력:  \n사용자의 질문에 대해 명확하고 직접적인 답변을 제공하세요.\n\n<context>\n{retrieved_contents}\n</context>\n\n<user_query>\n{query}\n</user_query>\n'}",0.0003142237663269043,2751.85,True
--- a/projects/example_01/benchmark_ck0/1/post_retrieve_node_line/summary.csv
+++ b/projects/example_01/benchmark_ck0/1/post_retrieve_node_line/summary.csv
@@ -0,0 +1,3 @@
+node_type,best_module_filename,best_module_name,best_module_params,best_execution_time
+prompt_maker,0.parquet,Fstring,"{'prompt': '### 작업:  \n지침에 따라 제공된 컨텍스트를 활용하여 사용자 질문에 답변하세요.  \n\n### 지침:  \n- 답을 모를 경우, 모른다고 명확히 말하세요.  \n- 확신이 없다면, 사용자에게 추가 설명을 요청하세요.  \n- 사용자의 질문과 동일한 언어로 답변하세요.  \n- 컨텍스트가 읽기 어렵거나 품질이 낮을 경우, 이를 사용자에게 알리고 최선의 답변을 제공하세요.  \n- 컨텍스트에 답이 없지만 알고 있는 내용이라면, 이를 사용자에게 설명하고 자신의 지식을 바탕으로 답변하세요.  \n- XML 태그를 사용하지 마세요.  \n\n### 출력:  \n사용자의 질문에 대해 명확하고 직접적인 답변을 제공하세요.\n\n<context>\n{retrieved_contents}\n</context>\n\n<user_query>\n{query}\n</user_query>\n'}",0.0003142237663269
+generator,0.parquet,LlamaIndexLLM,"{'llm': 'ollama', 'model': 'gemma3:12b', 'temperature': 0.0, 'request_timeout': 300.0, 'batch': 8}",0.8519447922706604
--- a/projects/example_01/benchmark_ck0/1/retrieve_node_line/passage_reranker/0.parquet
+++ b/projects/example_01/benchmark_ck0/1/retrieve_node_line/passage_reranker/0.parquet
--- a/projects/example_01/benchmark_ck0/1/retrieve_node_line/passage_reranker/best_0.parquet
+++ b/projects/example_01/benchmark_ck0/1/retrieve_node_line/passage_reranker/best_0.parquet
--- a/projects/example_01/benchmark_ck0/1/retrieve_node_line/passage_reranker/summary.csv
+++ b/projects/example_01/benchmark_ck0/1/retrieve_node_line/passage_reranker/summary.csv
@@ -0,0 +1,2 @@
+filename,module_name,module_params,execution_time,passage_reranker_retrieval_recall,passage_reranker_retrieval_precision,passage_reranker_retrieval_map,is_best
+0.parquet,DragonKue2,{'top_k': 5},0.12188564538955689,0.3,0.06,0.18916666666666665,True
--- a/projects/example_01/benchmark_ck0/1/retrieve_node_line/retrieval/0.parquet
+++ b/projects/example_01/benchmark_ck0/1/retrieve_node_line/retrieval/0.parquet
--- a/projects/example_01/benchmark_ck0/1/retrieve_node_line/retrieval/1.parquet
+++ b/projects/example_01/benchmark_ck0/1/retrieve_node_line/retrieval/1.parquet
--- a/projects/example_01/benchmark_ck0/1/retrieve_node_line/retrieval/2.parquet
+++ b/projects/example_01/benchmark_ck0/1/retrieve_node_line/retrieval/2.parquet
--- a/projects/example_01/benchmark_ck0/1/retrieve_node_line/retrieval/3.parquet
+++ b/projects/example_01/benchmark_ck0/1/retrieve_node_line/retrieval/3.parquet
--- a/projects/example_01/benchmark_ck0/1/retrieve_node_line/retrieval/4.parquet
+++ b/projects/example_01/benchmark_ck0/1/retrieve_node_line/retrieval/4.parquet
--- a/projects/example_01/benchmark_ck0/1/retrieve_node_line/retrieval/5.parquet
+++ b/projects/example_01/benchmark_ck0/1/retrieve_node_line/retrieval/5.parquet
--- a/projects/example_01/benchmark_ck0/1/retrieve_node_line/retrieval/best_2.parquet
+++ b/projects/example_01/benchmark_ck0/1/retrieve_node_line/retrieval/best_2.parquet
--- a/projects/example_01/benchmark_ck0/1/retrieve_node_line/retrieval/summary.csv
+++ b/projects/example_01/benchmark_ck0/1/retrieve_node_line/retrieval/summary.csv
@@ -0,0 +1,7 @@
+filename,module_name,module_params,execution_time,retrieval_f1,retrieval_recall,retrieval_precision,retrieval_ndcg,retrieval_map,retrieval_mrr,is_best
+0.parquet,VectorDB,"{'top_k': 10, 'vectordb': 'chroma_dragonkue2'}",0.10161013603210449,0.045454545454545456,0.25,0.025,0.14013009087326042,0.10625,0.10625,False
+1.parquet,BM25,"{'top_k': 10, 'bm25_tokenizer': 'ko_kiwi'}",1.9859044432640076,0.03636363636363636,0.2,0.02,0.07248116240107563,0.034999999999999996,0.034999999999999996,False
+2.parquet,HybridCC,"{'top_k': 10, 'normalize_method': 'dbsf', 'target_modules': ('VectorDB', 'BM25'), 'weight': 0.516, 'target_module_params': ({'top_k': 10, 'vectordb': 'chroma_dragonkue2'}, {'top_k': 10, 'bm25_tokenizer': 'ko_kiwi'})}",2.087514579296112,0.06363636363636363,0.35,0.035,0.20447427813233116,0.16041666666666665,0.16041666666666665,True
+3.parquet,HybridCC,"{'top_k': 10, 'normalize_method': 'mm', 'target_modules': ('VectorDB', 'BM25'), 'weight': 0.51, 'target_module_params': ({'top_k': 10, 'vectordb': 'chroma_dragonkue2'}, {'top_k': 10, 'bm25_tokenizer': 'ko_kiwi'})}",2.087514579296112,0.06363636363636363,0.35,0.035,0.20447427813233116,0.16041666666666665,0.16041666666666665,False
+4.parquet,HybridCC,"{'top_k': 10, 'normalize_method': 'tmm', 'target_modules': ('VectorDB', 'BM25'), 'weight': 0.454, 'target_module_params': ({'top_k': 10, 'vectordb': 'chroma_dragonkue2'}, {'top_k': 10, 'bm25_tokenizer': 'ko_kiwi'})}",2.087514579296112,0.05454545454545454,0.3,0.03,0.15007396002669662,0.10499999999999998,0.10499999999999998,False
+5.parquet,HybridCC,"{'top_k': 10, 'normalize_method': 'z', 'target_modules': ('VectorDB', 'BM25'), 'weight': 0.516, 'target_module_params': ({'top_k': 10, 'vectordb': 'chroma_dragonkue2'}, {'top_k': 10, 'bm25_tokenizer': 'ko_kiwi'})}",2.087514579296112,0.06363636363636363,0.35,0.035,0.20447427813233116,0.16041666666666665,0.16041666666666665,False
--- a/projects/example_01/benchmark_ck0/1/retrieve_node_line/summary.csv
+++ b/projects/example_01/benchmark_ck0/1/retrieve_node_line/summary.csv
@@ -0,0 +1,3 @@
+node_type,best_module_filename,best_module_name,best_module_params,best_execution_time
+retrieval,2.parquet,HybridCC,"{'top_k': 10, 'normalize_method': 'dbsf', 'target_modules': ('VectorDB', 'BM25'), 'weight': 0.516, 'target_module_params': ({'top_k': 10, 'vectordb': 'chroma_dragonkue2'}, {'top_k': 10, 'bm25_tokenizer': 'ko_kiwi'})}",2.087514579296112
+passage_reranker,0.parquet,DragonKue2,{'top_k': 5},0.1218856453895568
--- a/projects/example_01/benchmark_ck0/1/summary.csv
+++ b/projects/example_01/benchmark_ck0/1/summary.csv
@@ -0,0 +1,5 @@
+node_line_name,node_type,best_module_filename,best_module_name,best_module_params,best_execution_time
+retrieve_node_line,retrieval,2.parquet,HybridCC,"{'top_k': 10, 'normalize_method': 'dbsf', 'target_modules': ('VectorDB', 'BM25'), 'weight': 0.516, 'target_module_params': ({'top_k': 10, 'vectordb': 'chroma_dragonkue2'}, {'top_k': 10, 'bm25_tokenizer': 'ko_kiwi'})}",2.087514579296112
+retrieve_node_line,passage_reranker,0.parquet,DragonKue2,{'top_k': 5},0.1218856453895568
+post_retrieve_node_line,prompt_maker,0.parquet,Fstring,"{'prompt': '### 작업:  \n지침에 따라 제공된 컨텍스트를 활용하여 사용자 질문에 답변하세요.  \n\n### 지침:  \n- 답을 모를 경우, 모른다고 명확히 말하세요.  \n- 확신이 없다면, 사용자에게 추가 설명을 요청하세요.  \n- 사용자의 질문과 동일한 언어로 답변하세요.  \n- 컨텍스트가 읽기 어렵거나 품질이 낮을 경우, 이를 사용자에게 알리고 최선의 답변을 제공하세요.  \n- 컨텍스트에 답이 없지만 알고 있는 내용이라면, 이를 사용자에게 설명하고 자신의 지식을 바탕으로 답변하세요.  \n- XML 태그를 사용하지 마세요.  \n\n### 출력:  \n사용자의 질문에 대해 명확하고 직접적인 답변을 제공하세요.\n\n<context>\n{retrieved_contents}\n</context>\n\n<user_query>\n{query}\n</user_query>\n'}",0.0003142237663269
+post_retrieve_node_line,generator,0.parquet,LlamaIndexLLM,"{'llm': 'ollama', 'model': 'gemma3:12b', 'temperature': 0.0, 'request_timeout': 300.0, 'batch': 8}",0.8519447922706604
--- a/projects/example_01/benchmark_ck0/data/corpus.parquet
+++ b/projects/example_01/benchmark_ck0/data/corpus.parquet
--- a/projects/example_01/benchmark_ck0/data/qa.parquet
+++ b/projects/example_01/benchmark_ck0/data/qa.parquet
--- a/projects/example_01/benchmark_ck0/resources/bm25_ko_kiwi.pkl
+++ b/projects/example_01/benchmark_ck0/resources/bm25_ko_kiwi.pkl
--- a/projects/example_01/benchmark_ck0/resources/chroma/985a388c-103b-4534-a1a5-d7088ed74c0c/data_level0.bin
+++ b/projects/example_01/benchmark_ck0/resources/chroma/985a388c-103b-4534-a1a5-d7088ed74c0c/data_level0.bin
--- a/projects/example_01/benchmark_ck0/resources/chroma/985a388c-103b-4534-a1a5-d7088ed74c0c/header.bin
+++ b/projects/example_01/benchmark_ck0/resources/chroma/985a388c-103b-4534-a1a5-d7088ed74c0c/header.bin
--- a/projects/example_01/benchmark_ck0/resources/chroma/985a388c-103b-4534-a1a5-d7088ed74c0c/index_metadata.pickle
+++ b/projects/example_01/benchmark_ck0/resources/chroma/985a388c-103b-4534-a1a5-d7088ed74c0c/index_metadata.pickle
--- a/projects/example_01/benchmark_ck0/resources/chroma/985a388c-103b-4534-a1a5-d7088ed74c0c/length.bin
+++ b/projects/example_01/benchmark_ck0/resources/chroma/985a388c-103b-4534-a1a5-d7088ed74c0c/length.bin
--- a/projects/example_01/benchmark_ck0/resources/chroma/985a388c-103b-4534-a1a5-d7088ed74c0c/link_lists.bin
+++ b/projects/example_01/benchmark_ck0/resources/chroma/985a388c-103b-4534-a1a5-d7088ed74c0c/link_lists.bin
--- a/projects/example_01/benchmark_ck0/resources/chroma/chroma.sqlite3
+++ b/projects/example_01/benchmark_ck0/resources/chroma/chroma.sqlite3
--- a/projects/example_01/benchmark_ck0/resources/vectordb.yaml
+++ b/projects/example_01/benchmark_ck0/resources/vectordb.yaml
@@ -0,0 +1,7 @@
+vectordb:
+- client_type: persistent
+  collection_name: huggingface_drangonku-v2-ko
+  db_type: chroma
+  embedding_model: huggingface_drangonku-v2-ko
+  name: chroma_dragonkue2
+  path: ../projects/daesan-dangjin_01/benchmark/resources/chroma
--- a/projects/example_01/benchmark_ck0/trial.json
+++ b/projects/example_01/benchmark_ck0/trial.json
@@ -0,0 +1,10 @@
+[
+    {
+        "trial_name": "0",
+        "start_time": "2025-03-13 07:47:00"
+    },
+    {
+        "trial_name": "1",
+        "start_time": "2025-03-13 08:03:47"
+    }
+]
--- a/projects/example_01/chunk/0.parquet
+++ b/projects/example_01/chunk/0.parquet
--- a/projects/example_01/chunk/1.parquet
+++ b/projects/example_01/chunk/1.parquet
--- a/projects/example_01/chunk/2.parquet
+++ b/projects/example_01/chunk/2.parquet
--- a/projects/example_01/chunk/3.parquet
+++ b/projects/example_01/chunk/3.parquet
--- a/projects/example_01/chunk/4.parquet
+++ b/projects/example_01/chunk/4.parquet
--- a/projects/example_01/chunk/5.parquet
+++ b/projects/example_01/chunk/5.parquet
--- a/projects/example_01/chunk/6.parquet
+++ b/projects/example_01/chunk/6.parquet
--- a/projects/example_01/chunk/chunk_config.yaml
+++ b/projects/example_01/chunk/chunk_config.yaml
@@ -0,0 +1,13 @@
+modules:
+  - module_type: llama_index_chunk
+    chunk_method: Token
+    chunk_size: [200, 1000]
+    chunk_overlap: [30, 200]
+    add_file_name: ko 
+  - module_type: llama_index_chunk
+    chunk_method: Semantic_llama_index
+    embed_model: hf_snowflake-arctic-embed-l-v2.0-ko
+    add_file_name: ko 
+  - module_type: langchain_chunk
+    chunk_method: recursivecharacter
+    separators: [ " ", "\n" ]
--- a/projects/example_01/chunk/summary.csv
+++ b/projects/example_01/chunk/summary.csv
@@ -0,0 +1,8 @@
+filename,module_name,module_params,execution_time
+0.parquet,llama_index_chunk,"{'chunk_method': 'Token', 'chunk_size': 200, 'chunk_overlap': 200, 'add_file_name': 'ko'}",9.001352617045057e-05
+1.parquet,llama_index_chunk,"{'chunk_method': 'Token', 'chunk_size': 200, 'chunk_overlap': 30, 'add_file_name': 'ko'}",3.7807608560139556e-05
+2.parquet,llama_index_chunk,"{'chunk_method': 'Token', 'chunk_size': 1000, 'chunk_overlap': 200, 'add_file_name': 'ko'}",2.4567047525651943e-05
+3.parquet,llama_index_chunk,"{'chunk_method': 'Token', 'chunk_size': 1000, 'chunk_overlap': 30, 'add_file_name': 'ko'}",2.697287288367227e-05
+4.parquet,llama_index_chunk,"{'chunk_method': 'Semantic_llama_index', 'embed_model': 'hf_snowflake-arctic-embed-l-v2.0-ko', 'add_file_name': 'ko'}",0.006124294066363658
+5.parquet,langchain_chunk,"{'chunk_method': 'recursivecharacter', 'separators': ' '}",1.7906897601052548e-06
+6.parquet,langchain_chunk,"{'chunk_method': 'recursivecharacter', 'separators': '\n'}",2.2023862824233143e-06
--- a/projects/example_01/config.yaml
+++ b/projects/example_01/config.yaml
@@ -0,0 +1,92 @@
+vectordb:
+  - name: chroma_dragonkue2
+    db_type: chroma
+    client_type: persistent
+    embedding_model: huggingface_drangonku-v2-ko
+    collection_name: huggingface_drangonku-v2-ko
+    path: ${PROJECT_DIR}/resources/chroma
+
+node_lines:
+- node_line_name: retrieve_node_line  # Arbitrary node line name
+  nodes:
+    - node_type: retrieval
+      strategy:
+        metrics: [ retrieval_f1, retrieval_recall, retrieval_precision,
+                   retrieval_ndcg, retrieval_map, retrieval_mrr ]
+        speed_threshold: 10
+      top_k: 10
+      modules:
+        - module_type: bm25
+          bm25_tokenizer: [ ko_kiwi ] # ko_kiwi, ko_okt
+        - module_type: vectordb
+          vectordb: chroma_dragonkue2 # chromadb
+        - module_type: hybrid_cc
+          normalize_method: [ mm, tmm, z, dbsf ]
+          target_modules: ('bm25', 'vectordb')
+          weight_range: (0.6, 0.4)
+          test_weight_size: 101
+    - node_type: passage_reranker # re-ranker
+      strategy:
+        metrics:
+        - retrieval_recall
+        - retrieval_precision
+        - retrieval_map
+      modules:
+      - module_type: dragonkue2
+        top_k: 5
+
+- node_line_name: post_retrieve_node_line # generation nodes
+  nodes:
+  - node_type: prompt_maker
+    strategy:
+      metrics:
+      - metric_name: bleu
+      - metric_name: meteor
+      - metric_name: rouge
+      - metric_name: sem_score
+        embedding_model: huggingface_drangonku-v2-ko  # raise ValueError("Only one embedding model is supported")
+        lang: ko
+      generator_modules:
+      - module_type: llama_index_llm
+        llm: ollama
+        model: gemma3:12b
+        request_timeout: 3000.0
+    modules:
+      - module_type: fstring
+        prompt:
+        - |
+          ### 작업:  
+          지침에 따라 제공된 컨텍스트를 활용하여 사용자 질문에 답변하세요.  
+
+          ### 지침:  
+          - 답을 모를 경우, 모른다고 명확히 말하세요.  
+          - 확신이 없다면, 사용자에게 추가 설명을 요청하세요.  
+          - 사용자의 질문과 동일한 언어로 답변하세요.  
+          - 컨텍스트가 읽기 어렵거나 품질이 낮을 경우, 이를 사용자에게 알리고 최선의 답변을 제공하세요.  
+          - 컨텍스트에 답이 없지만 알고 있는 내용이라면, 이를 사용자에게 설명하고 자신의 지식을 바탕으로 답변하세요.  
+          - XML 태그를 사용하지 마세요.  
+
+          ### 출력:  
+          사용자의 질문에 대해 명확하고 직접적인 답변을 제공하세요.
+
+          <context>
+          {retrieved_contents}
+          </context>
+
+          <user_query>
+          {query}
+          </user_query>
+  - node_type: generator # Gen-LLM
+    strategy: 
+      metrics:
+      - metric_name: bleu
+      - metric_name: meteor
+      - metric_name: rouge
+      - metric_name: sem_score
+    modules:
+    - module_type: llama_index_llm
+      llm: ollama
+      model: gemma3:12b  # phi4, deepseek-r1:14b, aya-expanse:8b
+      temperature: 0.0 
+      request_timeout: 300.0
+      batch: 8
--- a/projects/example_01/config/chunk.yaml
+++ b/projects/example_01/config/chunk.yaml
@@ -0,0 +1,13 @@
+modules:
+  - module_type: llama_index_chunk
+    chunk_method: Token
+    chunk_size: [200, 800]
+    chunk_overlap: [30, 200]
+    add_file_name: ko 
+  - module_type: llama_index_chunk
+    chunk_method: Semantic_llama_index
+    embed_model: hf_snowflake-arctic-embed-l-v2.0-ko
+    add_file_name: ko 
+  - module_type: langchain_chunk
+    chunk_method: recursivecharacter
+    separators: [ " ", "\n" ]
--- a/projects/example_01/config/parse.yaml
+++ b/projects/example_01/config/parse.yaml
@@ -0,0 +1,4 @@
+modules:
+  - module_type: langchain_parse
+    file_type: pdf
+    parse_method: [ pdfminer, pdfplumber, pypdfium2, pypdf, pymupdf ]
--- a/projects/example_01/corpus.parquet
+++ b/projects/example_01/corpus.parquet
--- a/projects/example_01/json/corpus.json
+++ b/projects/example_01/json/corpus.json
--- a/projects/example_01/json/qa.json
+++ b/projects/example_01/json/qa.json
@@ -0,0 +1,262 @@
+[
+  {
+    "qid":"a388f189-31f3-4ca5-9394-3516d3f8c8b9",
+    "query":"국토지리정보원은 어떤 기준을 사용하여 국토공간에 대한 위치기반 조성을 하고 있습니까?",
+    "retrieval_gt":[
+      [
+        "c21a5c9a-a5e5-4273-9cda-74c09568b6e3"
+      ]
+    ],
+    "generation_gt":[
+      "국토지리정보원은 동경기준을 사용하여 국토공간에 대한 위치기반 조성을 하고 있습니다.",
+      "동경기준"
+    ]
+  },
+  {
+    "qid":"a45e3cfb-d024-4572-83b8-2ce1546234a1",
+    "query":"흐름 경로 길이 L이 0.4km 이하인 도시유역의 면적은 얼마 이하입니까?",
+    "retrieval_gt":[
+      [
+        "357ff48c-f44a-48f1-9995-b25c16da239f"
+      ]
+    ],
+    "generation_gt":[
+      "흐름 경로 길이 L이 0.4km 이하인 도시유역의 면적은 0.04km² 이하입니다.",
+      "0.04km² 이하"
+    ]
+  },
+  {
+    "qid":"0466c37b-6a1c-44fd-9b25-649cecbd9f53",
+    "query":"대산1터널의 시점부와 종점부의 길이는 각각 얼마입니까?",
+    "retrieval_gt":[
+      [
+        "d431ca69-6ca1-4b30-9a8d-39ac4f79a65b"
+      ]
+    ],
+    "generation_gt":[
+      "대산1터널의 시점부 길이는 138m이고, 종점부 길이는 95m입니다.",
+      "대산1터널 시점부: 138m, 종점부: 95m"
+    ]
+  },
+  {
+    "qid":"cbb1274e-2fb6-4c0c-b91c-ad4e35450472",
+    "query":"농도317호선의 이설계획 수립 길이는 얼마입니까?",
+    "retrieval_gt":[
+      [
+        "60ccec23-f61b-4a3c-b01d-d1480b3b172d"
+      ]
+    ],
+    "generation_gt":[
+      "농도317호선의 이설계획 수립 길이는 135.0m입니다.",
+      "135.0m"
+    ]
+  },
+  {
+    "qid":"7fa9a0d2-3a28-46fd-b05e-ddd8816488fe",
+    "query":"토공 계획시 대깎기부 발생에 따라 어떤 방법으로 자연훼손을 최소화하였습니까?",
+    "retrieval_gt":[
+      [
+        "04ee566f-014f-46ed-8b27-1d6798e682c8"
+      ]
+    ],
+    "generation_gt":[
+      "토공 계획시 대깎기부 발생에 따라 터널로 계획하여 자연훼손 및 발생예상 민원을 최소화하였습니다.",
+      "터널로 계획하여 자연훼손 및 발생예상 민원을 최소화하였습니다."
+    ]
+  },
+  {
+    "qid":"53c561c4-abf9-4458-b415-a134068cc0f9",
+    "query":"당진방향과 대산방향의 구분에서 각각의 개소 연장은 얼마입니까?",
+    "retrieval_gt":[
+      [
+        "60bdf1d5-c363-41d0-9bb0-97ba7276abe5"
+      ]
+    ],
+    "generation_gt":[
+      "당진방향의 개소 연장은 17,842.01m이고, 대산방향의 개소 연장은 18,070.00m입니다.",
+      "당진방향: 17,273.32 m, 대산방향: 18,070.00 m"
+    ]
+  },
+  {
+    "qid":"0aa5d2d7-106c-4503-aa76-6dc195243572",
+    "query":"비탈면보호 가시설 덮개의 설치구간은 어디입니까?",
+    "retrieval_gt":[
+      [
+        "eecbe2e3-56d8-49d7-934c-d29bb03b19ec"
+      ]
+    ],
+    "generation_gt":[
+      "비탈면보호 가시설 덮개의 설치구간은 1단이상 고성토부 (H> 6.0m)의 비탈면 전구간입니다.",
+      "1단이상 고성토부 (H>6.0m)의 비탈면 전구간"
+    ]
+  },
+  {
+    "qid":"1f68bd5a-bcb3-4008-b437-b525f3ff6da5",
+    "query":"용지조사는 어떤 법적 근거를 바탕으로 수행되었습니까?",
+    "retrieval_gt":[
+      [
+        "c933f8c6-a985-482c-9681-457813aaeb1c"
+      ]
+    ],
+    "generation_gt":[
+      "용지조사는 법적 근거인 지적도, 토지대장, 등기부등본, 가옥대장 등을 바탕으로 수행되었습니다.",
+      "지적도, 토지대장, 등기부등본, 가옥대장"
+    ]
+  },
+  {
+    "qid":"4dbd7967-acc5-41d3-b0a8-192fd0064ab0",
+    "query":"대호지통과구간에 설치된 방호벽의 높이는 얼마입니까?",
+    "retrieval_gt":[
+      [
+        "f650e3a1-a64c-4968-aad1-3f4ba79b73e6"
+      ]
+    ],
+    "generation_gt":[
+      "대호지통과구간에 설치된 방호벽의 높이는 1.0m입니다.",
+      "1.0m"
+    ]
+  },
+  {
+    "qid":"44cf2b37-0742-4335-b9a2-122887a08314",
+    "query":"천의리 171-3대의 가옥 뒤편에 어떤 부지를 확인하여 검토하겠다고 언급되었습니까?",
+    "retrieval_gt":[
+      [
+        "0294cbb6-8218-4a64-b7c9-8175ba5ee7ae"
+      ]
+    ],
+    "generation_gt":[
+      "천의리 171-3대의 가옥 뒤편에 임야부지를 확인하여 진출입이 가능한지 검토하겠다고 언급되었습니다.",
+      "임야부지"
+    ]
+  },
+  {
+    "qid":"56afbda5-8a91-4441-9f34-dd37b5ac7ea4",
+    "query":"포장공의 요철보정층의 부피는 얼마입니까?",
+    "retrieval_gt":[
+      [
+        "868c77cf-58a1-465b-80c4-7be269dcaf0c"
+      ]
+    ],
+    "generation_gt":[
+      "포장공의 요철보정층의 부피는 10,479 m³입니다.",
+      "10,479 m³"
+    ]
+  },
+  {
+    "qid":"14cc8787-18ea-45b8-baf6-58632a695dee",
+    "query":"직선부의 길이는 얼마입니까?",
+    "retrieval_gt":[
+      [
+        "b48a3337-d2d8-44a5-9b96-1ac11a8e5e3b"
+      ]
+    ],
+    "generation_gt":[
+      "직선부의 길이는 4,110 m입니다.",
+      "1,483 m"
+    ]
+  },
+  {
+    "qid":"d62a7714-2a59-4d19-8df9-9d4ed5c988bb",
+    "query":"동물유도휀스의 설계기준 중 방형울타리가 유지관리 용이한 지역은 어디입니까?",
+    "retrieval_gt":[
+      [
+        "da9c74a9-6076-4406-a155-95fd6951232e"
+      ]
+    ],
+    "generation_gt":[
+      "동물유도휀스의 설계기준 중 방형울타리가 유지관리 용이한 지역은 시권 지역으로 주행경관 및 유지관리 접근성을 고려한 평지, 높이 15m 이하 절토부 및 기타 관리 용이 지역입니다.",
+      "시권 지역, 평지, 높이 15m 이하 절토부, 기타 관리 용이 지역"
+    ]
+  },
+  {
+    "qid":"7984c53a-42db-438e-b2c5-3c1526274dd2",
+    "query":"표지판은 어떤 두 가지 목적에 따라 구분됩니까?",
+    "retrieval_gt":[
+      [
+        "2aeca049-a3a0-42b0-8b27-3285d4b50611"
+      ]
+    ],
+    "generation_gt":[
+      "표지판은 도로상의 안전통행을 위한 교통안전표지와 도로정보의 전달을 목적으로 하는 도로표지로 구분됩니다.",
+      "교통안전표지, 도로표지"
+    ]
+  },
+  {
+    "qid":"2addd3bc-a294-49e4-9cd0-ae8155497cf7",
+    "query":"안전관리계획서 검토비는 얼마입니까?",
+    "retrieval_gt":[
+      [
+        "24591b96-9edb-43d8-a6cf-2e2b3b40d8a1"
+      ]
+    ],
+    "generation_gt":[
+      "안전관리계획서 검토비는 0.00입니다.",
+      "안전관리계획서 검토비는 명시되어 있지 않습니다."
+    ]
+  },
+  {
+    "qid":"5be5ff8f-55ad-482d-8be0-f70e06a20a37",
+    "query":"염수분사시설에 대한 내용은 몇 페이지에 걸쳐 있습니까?",
+    "retrieval_gt":[
+      [
+        "2a4bac5e-988d-4b6c-9da3-93b9dc6ca1bf"
+      ]
+    ],
+    "generation_gt":[
+      "염수분사시설에 대한 내용은 4-176 페이지에 있습니다.",
+      "4-176 페이지"
+    ]
+  },
+  {
+    "qid":"8763d9a2-7a2a-49e6-ac0c-8cda830287b5",
+    "query":"주어진 내용에서 숫자 6과 7이 포함된 항목은 무엇입니까?",
+    "retrieval_gt":[
+      [
+        "a6c51478-9b99-41c0-9016-1805732421de"
+      ]
+    ],
+    "generation_gt":[
+      "주어진 내용에서 숫자 6과 7이 포함된 항목은 다음과 같습니다:\n\n- 6\n- 7\n- 6\n- 7\n- 6\n- 5\n- 6\n- 5\n- 7\n- 7\n- 6\n- 5\n- 7\n- 3\n- 6\n- 3\n- 7\n- 3\n- 0\n- 1",
+      "6, 7, 6, 7, 6, 5, 6, 5, 7, 7, 6, 5, 7, 9, 6, 3, 7, 3, 0, 1"
+    ]
+  },
+  {
+    "qid":"e4305a4c-11f1-4f9e-8c51-09f9cea9798e",
+    "query":"노선 변경이 없을 것으로 판단되는 특별한 사유는 무엇입니까?",
+    "retrieval_gt":[
+      [
+        "c89553ac-a5a8-4a2e-a096-bc6570398a27"
+      ]
+    ],
+    "generation_gt":[
+      "특별한 사유가 없는 한 노선 변경은 없을 것으로 판단된다고 명시되어 있습니다. 따라서 특별한 사유에 대한 구체적인 내용은 제공되지 않고 있습니다.",
+      "특별한 사유가 없는 한 노선 변경은 없을 것으로 판단됨."
+    ]
+  },
+  {
+    "qid":"3a9d1296-844c-4012-ad1a-ac96ac4041cd",
+    "query":"성남시의 국도 42호선은 몇 차로로 구성되어 있습니까?",
+    "retrieval_gt":[
+      [
+        "c8b67f05-72f3-4445-a344-5d56d93114b4"
+      ]
+    ],
+    "generation_gt":[
+      "성남시의 국도 42호선은 4차로로 구성되어 있습니다.",
+      "4차로"
+    ]
+  },
+  {
+    "qid":"8f291e67-8f6a-4d4c-a4cd-6554e4cfd722",
+    "query":"주어진 내용에서 숫자 12가 몇 번 등장합니까?",
+    "retrieval_gt":[
+      [
+        "17588b98-93bc-42f5-8cfa-2e520c7482ed"
+      ]
+    ],
+    "generation_gt":[
+      "주어진 내용에서 숫자 12는 3번 등장합니다.",
+      "2번 등장."
+    ]
+  }
+]
--- a/projects/example_01/parse/parse_config.yaml
+++ b/projects/example_01/parse/parse_config.yaml
@@ -0,0 +1,4 @@
+modules:
+  - module_type: langchain_parse
+    file_type: pdf
+    parse_method: [ pdfminer, pdfplumber, pypdfium2, pypdf, pymupdf ]
--- a/projects/example_01/parse/parsed_result.parquet
+++ b/projects/example_01/parse/parsed_result.parquet
--- a/projects/example_01/parse/pdf.parquet
+++ b/projects/example_01/parse/pdf.parquet
--- a/projects/example_01/parse/summary.csv
+++ b/projects/example_01/parse/summary.csv
@@ -0,0 +1,6 @@
+filename,module_name,module_params,execution_time
+pdf.parquet,langchain_parse,"{'file_type': 'pdf', 'parse_method': 'pymupdf'}",0.015248891783923638
+pdf.parquet,langchain_parse,"{'file_type': 'pdf', 'parse_method': 'pypdf'}",0.15360368810048916
+pdf.parquet,langchain_parse,"{'file_type': 'pdf', 'parse_method': 'pdfplumber'}",0.42682165052832627
+pdf.parquet,langchain_parse,"{'file_type': 'pdf', 'parse_method': 'pdfminer'}",0.44084878549343204
+pdf.parquet,langchain_parse,"{'file_type': 'pdf', 'parse_method': 'pypdfium2'}",0.008509700472761944
--- a/projects/example_01/qa.parquet
+++ b/projects/example_01/qa.parquet
--- a/projects/example_01/raw_data/00.종합보고서
+++ b/projects/example_01/raw_data/00.종합보고서
--- a/projects/example_01/raw_data/01.제1장.
+++ b/projects/example_01/raw_data/01.제1장.
--- a/projects/src/check_corpus_ids.py
+++ b/projects/src/check_corpus_ids.py
@@ -0,0 +1,12 @@
+import pandas as pd
+
+# corpus.parquet 파일 로드
+corpus_path = "./original/corpus.parquet"
+corpus_data = pd.read_parquet(corpus_path)
+
+# 특정 문서 ID가 존재하는지 확인
+doc_id = "bac7dea6-9477-4290-b57b-861548f7020d"
+print(doc_id in corpus_data['doc_id'].values)  # True면 존재, False면 없음
+
+# corpus_data의 첫 5개 데이터 확인
+print(corpus_data.head())
--- a/projects/src/check_vectordb_corpus.py
+++ b/projects/src/check_vectordb_corpus.py
@@ -0,0 +1,14 @@
+from chromadb import PersistentClient
+
+# ChromaDB 클라이언트 연결
+client = PersistentClient(path="./report_01/chroma")
+
+# 컬렉션 목록 확인
+print(client.list_collections())
+
+# 'document_collection' 컬렉션에서 데이터 조회
+collection = client.get_collection("document_collection")
+
+# 저장된 모든 문서 ID 조회
+stored_docs = collection.get(include=["ids"])
+print("Stored Document IDs:", stored_docs)
--- a/projects/src/check_vectordb_ingestion.py
+++ b/projects/src/check_vectordb_ingestion.py
@@ -0,0 +1,11 @@
+import chromadb
+
+# ChromaDB 연결
+client = chromadb.PersistentClient(path="./report_01/chroma")
+collection = client.get_collection("document_collection")
+
+# 저장된 문서 개수 확인
+print("Stored Document Count:", len(collection.get(include=['ids'])['ids']))
+
+# 일부 문서 ID 확인
+print("Example Document IDs:", collection.get(include=['ids'], limit=5)['ids'])
--- a/projects/src/convert_parquet_to_json.py
+++ b/projects/src/convert_parquet_to_json.py
@@ -0,0 +1,21 @@
+import os
+import pandas as pd
+
+SOURCE_DIR = "/usr/src/app/projects/daesan-dangjin_01"
+TARGET_DIR = "/usr/src/app/projects/daesan-dangjin_01/json"
+os.makedirs(TARGET_DIR, exist_ok=True)
+
+parquet_files = [f for f in os.listdir(SOURCE_DIR) if f.endswith(".parquet")]
+
+
+for file in parquet_files:
+    parquet_path = os.path.join(SOURCE_DIR, file)
+    json_filename = os.path.splitext(file)[0] + ".json"
+    json_path = os.path.join(TARGET_DIR, json_filename)
+    
+    df = pd.read_parquet(parquet_path, engine="pyarrow")
+    df.to_json(json_path, orient="records", force_ascii=False, indent=2)
+
+    print(f"✅ 변환 완료: {json_path}")
+
+print(f"📁 모든 Parquet 파일이 JSON으로 변환되어 {TARGET_DIR}에 저장되었습니다.")