first commit

2025-03-27 16:11:09 +09:00
parent bd308ea2df
commit 1d1d4e62b2
9 changed files with 300 additions and 5 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -0,0 +1,2 @@
+es
+split_KCS
--- a/README.md
+++ b/README.md
@@ -8,8 +8,6 @@

 ### 설치

-#### 도커
-
 ```text
 docker compose build  
 docker compose up
@@ -17,10 +15,8 @@ docker compose up

 ## 사용하기

-### API
+### query.py

 ```text
-query.py
 query = {"query": {"match": {"code": "101015"}}} 수정 후 질의
 ```
-
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -0,0 +1,15 @@
+version: '3.8'
+
+services:
+  elasticsearch:
+    image: docker.elastic.co/elasticsearch/elasticsearch:7.17.4
+    container_name: myes
+    environment:
+      - discovery.type=single-node
+    ports:
+      - "9200:9200"  
+      - "9300:9300"
+    # volumes:
+    #   - esdata:/usr/share/elasticsearch/data 
+# volumes:
+#  esdata:
--- a/7
+++ b/7
@@ -0,0 +1,7 @@
+FROM docker.elastic.co/elasticsearch/elasticsearch:7.17.4
+
+ENV discovery.type=single-node
+
+EXPOSE 9200 9300
+
+CMD ["elasticsearch"]
--- a/dumy_insert.py
+++ b/dumy_insert.py
@@ -0,0 +1,123 @@
+import csv
+
+from elasticsearch import Elasticsearch
+from elasticsearch.helpers import bulk
+
+# --- Configuration ---
+ELASTICSEARCH_HOSTS = ["http://localhost:9200"]  # Elasticsearch address (edit if needed)
+INDEX_NAME = "my-user-index"  # name of the index to write to
+CSV_FILE_PATH = "dummy_data2.csv"  # path of the CSV file to load
+# --- End of configuration ---
+
+# Create the Elasticsearch client; the script stops here if the cluster cannot be reached
+try:
+    es = Elasticsearch(ELASTICSEARCH_HOSTS)
+    # Connectivity check (optional): a falsy ping() is treated as a connection failure
+    if not es.ping():
+        raise ValueError("Elasticsearch 연결 실패!")
+    print("Elasticsearch 연결 성공!")
+except Exception as e:
+    print(f"Elasticsearch 연결 중 오류 발생: {e}")
+    exit()  # NOTE(review): exit() without an argument ends the process with status 0 on this failure path
+
+
+def generate_actions(csv_filepath, index_name):
+    """CSV 파일을 읽어 Elasticsearch bulk API 액션을 생성합니다."""
+    try:
+        with open(csv_filepath, "r", encoding="utf-8") as f:
+            reader = csv.reader(f)
+            header = next(reader)  # 헤더 읽기
+
+            for row in reader:
+                if not row:  # 빈 줄 건너뛰기
+                    continue
+
+                doc = {}
+                try:
+                    # 헤더와 데이터 매핑
+                    doc = dict(zip(header, row))
+
+                    # 데이터 타입 변환 (age를 정수로)
+                    if "age" in doc and doc["age"]:  # 값이 있을 때만 변환 시도
+                        try:
+                            doc["age"] = int(doc["age"])
+                        except ValueError:
+                            print(
+                                f"경고: 'age' 필드 정수 변환 실패 (값: {doc['age']}). 행: {row}"
+                            )
+                            # age 변환 실패 시 처리: None으로 설정하거나, 로그만 남기고 진행하거나 선택
+                            doc["age"] = None  # 예시: None으로 설정
+                    elif "age" in doc and not doc["age"]:
+                        doc["age"] = None  # 빈 문자열이면 None으로 설정
+
+                    # created_at은 Elasticsearch가 자동으로 date 타입으로 인식할 수 있음
+                    # (인덱스 매핑 설정에 따라 다를 수 있음)
+
+                    # Bulk API 형식에 맞게 생성
+                    yield {
+                        "_index": index_name,
+                        "_id": doc.get("id"),  # CSV의 id를 Elasticsearch 문서 ID로 사용
+                        "_source": doc,
+                    }
+                except ValueError as ve:
+                    print(f"데이터 변환 오류 (행 건너뜀): {row} - {ve}")
+                    continue
+                except Exception as ex:
+                    print(f"행 처리 중 오류 발생 (행 건너뜀): {row} - {ex}")
+                    continue
+
+    except FileNotFoundError:
+        print(f"오류: CSV 파일 '{csv_filepath}'를 찾을 수 없습니다.")
+        exit()
+    except Exception as e:
+        print(f"CSV 파일 읽기 중 오류 발생: {e}")
+        exit()
+
+
+# --- Run ---
+print(
+    f"'{CSV_FILE_PATH}' 파일에서 데이터를 읽어 '{INDEX_NAME}' 인덱스로 벌크 삽입을 시작합니다..."
+)
+
+try:
+    # Insert the rows with the bulk helper
+    success, failed_items = bulk(
+        es,
+        generate_actions(CSV_FILE_PATH, INDEX_NAME),
+        chunk_size=500,
+        request_timeout=60,
+        raise_on_error=False,
+        raise_on_exception=False,
+    )  # the raise_* flags are turned off so the run is not aborted when an error occurs
+
+    # Number of failures (length of the failed_items list)
+    num_failed = len(failed_items)
+
+    print(f"벌크 삽입 완료: 성공={success}, 실패={num_failed}")
+
+    # *** Revised section ***
+    # Check whether failed_items is non-empty, i.e. whether any item failed
+    if failed_items:  # equivalently: if num_failed > 0
+        print(
+            f"실패한 항목 {num_failed}개가 있습니다. 상세 내용은 아래와 같습니다 (일부만 표시될 수 있음):"
+        )
+        # Print failure details, capped at the first 5 items
+        for i, item in enumerate(failed_items):
+            if i < 5:
+                print(f"  - {item}")
+            else:
+                print(f"  ... (총 {num_failed}개 중 {i}개 표시)")
+                break
+        # If needed, every failed item could be written to a log file instead.
+    else:
+        # Refresh the index so the data becomes searchable; as written this runs only when nothing failed
+        try:
+            es.indices.refresh(index=INDEX_NAME)
+            print(f"인덱스 '{INDEX_NAME}' 새로고침 완료.")
+        except Exception as refresh_err:
+            print(f"인덱스 새로고침 중 오류 발생: {refresh_err}")
+
+
+except Exception as e:
+    # Reached when the bulk call itself raises (e.g. a connection problem)
+    print(f"벌크 삽입 중 예상치 못한 오류 발생: {e}")
--- a/kcs_insert.py
+++ b/kcs_insert.py
@@ -0,0 +1,40 @@
+import csv
+import os
+
+from elasticsearch import Elasticsearch
+from elasticsearch.helpers import bulk
+
+# Create the Elasticsearch client
+es = Elasticsearch("http://localhost:9200")
+
+# Path of the folder that holds the CSV files
+csv_folder = r"split_KCS"  # folder containing the CSV files to index
+
+
+# Load CSV rows into Elasticsearch
+def index_csv_files(folder_path, index_name):
+    """Index every row of every .csv file directly inside folder_path into index_name via one bulk() call."""
+    docs = []  # actions accumulated across all files for the bulk call at the end
+    # Look at every entry in the folder (os.listdir does not descend into subfolders)
+    for filename in os.listdir(folder_path):
+        if filename.endswith(".csv"):  # only CSV files are processed
+            file_path = os.path.join(folder_path, filename)
+            print(f"📂 {filename} 처리 중...")
+
+            # Read the CSV; DictReader uses the first row as the field names
+            with open(file_path, mode="r", encoding="utf-8") as file:
+                csv_reader = csv.DictReader(file)
+                for row in csv_reader:
+                    # Wrap the row as a bulk action; no _id is set here
+                    docs.append({"_index": index_name, "_source": row})
+
+            print(f"✅ {filename} 처리 완료")
+
+    # Send everything to Elasticsearch at once through the bulk helper
+    if docs:
+        bulk(es, docs)
+        print(f"🚀 총 {len(docs)}개의 문서 삽입 완료!")
+
+
+# Run: index the CSV files in csv_folder into "my-user-index"
+index_csv_files(csv_folder, "my-user-index")
--- a/make_dumy.py
+++ b/make_dumy.py
@@ -0,0 +1,32 @@
+import random
+
+import pandas as pd
+from faker import Faker
+
+fake = Faker("ko_KR")
+
+# Number of rows to generate
+n_rows = 1000000  # change as needed
+
+# Dummy records, one dict per row
+data = []
+for _ in range(n_rows):
+    data.append(
+        {
+            "id": fake.uuid4(),  # random UUID
+            "name": fake.name(),  # random name
+            "email": fake.email(),  # random email
+            "age": random.randint(20, 60),  # age from 20 to 60, both ends included
+            "city": fake.city(),  # random city
+            "created_at": fake.date_time_this_decade(),  # creation timestamp from Faker's date_time_this_decade()
+        }
+    )
+
+# Build the DataFrame
+df = pd.DataFrame(data)
+
+# Save as a CSV file
+csv_filename = "dummy_data2.csv"
+df.to_csv(csv_filename, index=False, encoding="utf-8-sig")  # utf-8-sig writes a BOM at the start of the file
+
+print(f"{csv_filename} 파일이 생성되었습니다! ✅")
--- a/query.py
+++ b/query.py
@@ -0,0 +1,19 @@
+import pprint
+import time
+
+import requests
+
+url = "http://localhost:9200/my-user-index/_search"
+headers = {"Content-Type": "application/json"}
+query = {"query": {"match": {"code": "101015"}}}
+
+start_time = time.time()
+response = requests.get(url, headers=headers, json=query)
+elapsed_time = time.time() - start_time
+
+if response.status_code == 200:
+    result = response.json()
+    pprint.pprint(result)
+    print(f"Query Time: {elapsed_time:.6f} seconds")
+else:
+    print(f"Error: {response.status_code}, {response.text}")
--- a/requirements.txt
+++ b/requirements.txt
@@ -0,0 +1,61 @@
+aiohappyeyeballs==2.6.1
+aiohttp==3.11.14
+aiosignal==1.3.2
+asttokens==3.0.0
+async-timeout==5.0.1
+attrs==25.3.0
+certifi==2025.1.31
+charset-normalizer==3.4.1
+comm==0.2.2
+datasets==3.4.1
+debugpy==1.8.13
+decorator==5.2.1
+dill==0.3.8
+elastic-transport==8.17.1
+elasticsearch==8.17.2
+exceptiongroup==1.2.2
+executing==2.2.0
+Faker==37.1.0
+filelock==3.18.0
+frozenlist==1.5.0
+fsspec==2024.12.0
+huggingface-hub==0.29.3
+idna==3.10
+ipykernel==6.29.5
+ipython==8.34.0
+jedi==0.19.2
+jupyter_client==8.6.3
+jupyter_core==5.7.2
+matplotlib-inline==0.1.7
+multidict==6.2.0
+multiprocess==0.70.16
+nest-asyncio==1.6.0
+numpy==2.2.4
+packaging==24.2
+pandas==2.2.3
+parso==0.8.4
+pexpect==4.9.0
+platformdirs==4.3.7
+prompt_toolkit==3.0.50
+propcache==0.3.0
+psutil==7.0.0
+ptyprocess==0.7.0
+pure_eval==0.2.3
+pyarrow==19.0.1
+Pygments==2.19.1
+python-dateutil==2.9.0.post0
+pytz==2025.2
+PyYAML==6.0.2
+pyzmq==26.3.0
+requests==2.32.3
+six==1.17.0
+stack-data==0.6.3
+tornado==6.4.2
+tqdm==4.67.1
+traitlets==5.14.3
+typing_extensions==4.12.2
+tzdata==2025.2
+urllib3==2.3.0
+wcwidth==0.2.13
+xxhash==3.5.0
+yarl==1.18.3