first commit

2025-03-27 16:11:09 +09:00
parent bd308ea2df
commit 1d1d4e62b2
9 changed files with 300 additions and 5 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -0,0 +1,2 @@
 es
 split_KCS
--- a/README.md
+++ b/README.md
@@ -8,8 +8,6 @@
 ### 설치
 #### 도커
 ```text
 docker compose build  
 docker compose up
@@ -17,10 +15,8 @@ docker compose up
 ## 사용하기
-### API
+### query.py
 ```text
 query.py
 query = {"query": {"match": {"code": "101015"}}} 수정 후 질의
 ```
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -0,0 +1,15 @@
 version: '3.8'
 services:
  elasticsearch:
    image: docker.elastic.co/elasticsearch/elasticsearch:7.17.4
    container_name: myes
    environment:
      - discovery.type=single-node
    ports:
      - "9200:9200"  
      - "9300:9300"
    # volumes:
    #   - esdata:/usr/share/elasticsearch/data 
 # volumes:
 #  esdata:
--- a/7
+++ b/7
@@ -0,0 +1,7 @@
 FROM docker.elastic.co/elasticsearch/elasticsearch:7.17.4
 ENV discovery.type=single-node
 EXPOSE 9200 9300
 CMD ["elasticsearch"]
--- a/dumy_insert.py
+++ b/dumy_insert.py
@@ -0,0 +1,123 @@
 import csv
 from elasticsearch import Elasticsearch
 from elasticsearch.helpers import bulk
 # --- 설정 ---
 ELASTICSEARCH_HOSTS = ["http://localhost:9200"]  # Elasticsearch 주소 (필요시 수정)
 INDEX_NAME = "my-user-index"  # 사용할 인덱스 이름
 CSV_FILE_PATH = "dummy_data2.csv"  # CSV 파일 경로
 # --- 설정 끝 ---
 # Elasticsearch 클라이언트 생성
 try:
    es = Elasticsearch(ELASTICSEARCH_HOSTS)
    # 연결 테스트 (선택 사항)
    if not es.ping():
        raise ValueError("Elasticsearch 연결 실패!")
    print("Elasticsearch 연결 성공!")
 except Exception as e:
    print(f"Elasticsearch 연결 중 오류 발생: {e}")
    exit()
 def generate_actions(csv_filepath, index_name):
    """CSV 파일을 읽어 Elasticsearch bulk API 액션을 생성합니다."""
    try:
        with open(csv_filepath, "r", encoding="utf-8") as f:
            reader = csv.reader(f)
            header = next(reader)  # 헤더 읽기
            for row in reader:
                if not row:  # 빈 줄 건너뛰기
                    continue
                doc = {}
                try:
                    # 헤더와 데이터 매핑
                    doc = dict(zip(header, row))
                    # 데이터 타입 변환 (age를 정수로)
                    if "age" in doc and doc["age"]:  # 값이 있을 때만 변환 시도
                        try:
                            doc["age"] = int(doc["age"])
                        except ValueError:
                            print(
                                f"경고: 'age' 필드 정수 변환 실패 (값: {doc['age']}). 행: {row}"
                            )
                            # age 변환 실패 시 처리: None으로 설정하거나, 로그만 남기고 진행하거나 선택
                            doc["age"] = None  # 예시: None으로 설정
                    elif "age" in doc and not doc["age"]:
                        doc["age"] = None  # 빈 문자열이면 None으로 설정
                    # created_at은 Elasticsearch가 자동으로 date 타입으로 인식할 수 있음
                    # (인덱스 매핑 설정에 따라 다를 수 있음)
                    # Bulk API 형식에 맞게 생성
                    yield {
                        "_index": index_name,
                        "_id": doc.get("id"),  # CSV의 id를 Elasticsearch 문서 ID로 사용
                        "_source": doc,
                    }
                except ValueError as ve:
                    print(f"데이터 변환 오류 (행 건너뜀): {row} - {ve}")
                    continue
                except Exception as ex:
                    print(f"행 처리 중 오류 발생 (행 건너뜀): {row} - {ex}")
                    continue
    except FileNotFoundError:
        print(f"오류: CSV 파일 '{csv_filepath}'를 찾을 수 없습니다.")
        exit()
    except Exception as e:
        print(f"CSV 파일 읽기 중 오류 발생: {e}")
        exit()
 # --- 실행 ---
 print(
    f"'{CSV_FILE_PATH}' 파일에서 데이터를 읽어 '{INDEX_NAME}' 인덱스로 벌크 삽입을 시작합니다..."
 )
 try:
    # bulk 함수를 사용하여 데이터 삽입
    success, failed_items = bulk(
        es,
        generate_actions(CSV_FILE_PATH, INDEX_NAME),
        chunk_size=500,
        request_timeout=60,
        raise_on_error=False,
        raise_on_exception=False,
    )  # 에러 발생 시 중단하지 않도록 설정 추가
    # 실패 건수 계산 (failed_items 리스트의 길이)
    num_failed = len(failed_items)
    print(f"벌크 삽입 완료: 성공={success}, 실패={num_failed}")
    # *** 수정된 부분 ***
    # failed_items 리스트가 비어있지 않은지 확인 (즉, 실패한 항목이 있는지 확인)
    if failed_items:  # 또는 if num_failed > 0:
        print(
            f"실패한 항목 {num_failed}개가 있습니다. 상세 내용은 아래와 같습니다 (일부만 표시될 수 있음):"
        )
        # 실패 상세 내용 출력 (예: 처음 5개만)
        for i, item in enumerate(failed_items):
            if i < 5:
                print(f"  - {item}")
            else:
                print(f"  ... (총 {num_failed}개 중 {i}개 표시)")
                break
        # 필요하다면 모든 실패 항목을 로그 파일 등에 기록할 수 있습니다.
    else:
        # 인덱스 새로고침 (데이터가 검색 가능하도록) - 실패가 없을 때만 수행하거나 항상 수행하도록 선택 가능
        try:
            es.indices.refresh(index=INDEX_NAME)
            print(f"인덱스 '{INDEX_NAME}' 새로고침 완료.")
        except Exception as refresh_err:
            print(f"인덱스 새로고침 중 오류 발생: {refresh_err}")
 except Exception as e:
    # bulk 함수 자체에서 예외가 발생한 경우 (예: 연결 문제)
    print(f"벌크 삽입 중 예상치 못한 오류 발생: {e}")
--- a/kcs_insert.py
+++ b/kcs_insert.py
@@ -0,0 +1,40 @@
 import csv
 import os
 from elasticsearch import Elasticsearch
 from elasticsearch.helpers import bulk
 # Elasticsearch 클라이언트 생성
 es = Elasticsearch("http://localhost:9200")
 # CSV 파일이 있는 폴더 경로 설정
 csv_folder = r"split_KCS"  # CSV 파일들이 들어 있는 폴더
 # Elasticsearch에 데이터 삽입 함수
 def index_csv_files(folder_path, index_name):
    docs = []  # bulk 삽입을 위한 리스트
    # 폴더 내의 모든 CSV 파일 찾기
    for filename in os.listdir(folder_path):
        if filename.endswith(".csv"):  # CSV 파일만 처리
            file_path = os.path.join(folder_path, filename)
            print(f"📂 {filename} 처리 중...")
            # CSV 파일 읽기
            with open(file_path, mode="r", encoding="utf-8") as file:
                csv_reader = csv.DictReader(file)
                for row in csv_reader:
                    # Elasticsearch 문서 형태로 변환
                    docs.append({"_index": index_name, "_source": row})
            print(f"✅ {filename} 처리 완료")
    # bulk API를 이용해 한 번에 Elasticsearch에 삽입
    if docs:
        bulk(es, docs)
        print(f"🚀 총 {len(docs)}개의 문서 삽입 완료!")
 # 실행
 index_csv_files(csv_folder, "my-user-index")
--- a/make_dumy.py
+++ b/make_dumy.py
@@ -0,0 +1,32 @@
 import random
 import pandas as pd
 from faker import Faker
 fake = Faker("ko_KR")
 # 생성할 데이터 개수
 n_rows = 1000000  # 원하는 만큼 변경 가능
 # 더미 데이터 리스트
 data = []
 for _ in range(n_rows):
    data.append(
        {
            "id": fake.uuid4(),  # 랜덤 UUID
            "name": fake.name(),  # 랜덤 이름
            "email": fake.email(),  # 랜덤 이메일
            "age": random.randint(20, 60),  # 20~60 사이 나이
            "city": fake.city(),  # 랜덤 도시
            "created_at": fake.date_time_this_decade(),  # 최근 10년 내 생성 날짜
        }
    )
 # DataFrame 생성
 df = pd.DataFrame(data)
 # CSV 파일로 저장
 csv_filename = "dummy_data2.csv"
 df.to_csv(csv_filename, index=False, encoding="utf-8-sig")
 print(f"{csv_filename} 파일이 생성되었습니다! ✅")
--- a/query.py
+++ b/query.py
@@ -0,0 +1,19 @@
 import pprint
 import time
 import requests
 url = "http://localhost:9200/my-user-index/_search"
 headers = {"Content-Type": "application/json"}
 query = {"query": {"match": {"code": "101015"}}}
 start_time = time.time()
 response = requests.get(url, headers=headers, json=query)
 elapsed_time = time.time() - start_time
 if response.status_code == 200:
    result = response.json()
    pprint.pprint(result)
    print(f"Query Time: {elapsed_time:.6f} seconds")
 else:
    print(f"Error: {response.status_code}, {response.text}")
--- a/requirements.txt
+++ b/requirements.txt
@@ -0,0 +1,61 @@
 aiohappyeyeballs==2.6.1
 aiohttp==3.11.14
 aiosignal==1.3.2
 asttokens==3.0.0
 async-timeout==5.0.1
 attrs==25.3.0
 certifi==2025.1.31
 charset-normalizer==3.4.1
 comm==0.2.2
 datasets==3.4.1
 debugpy==1.8.13
 decorator==5.2.1
 dill==0.3.8
 elastic-transport==8.17.1
 elasticsearch==8.17.2
 exceptiongroup==1.2.2
 executing==2.2.0
 Faker==37.1.0
 filelock==3.18.0
 frozenlist==1.5.0
 fsspec==2024.12.0
 huggingface-hub==0.29.3
 idna==3.10
 ipykernel==6.29.5
 ipython==8.34.0
 jedi==0.19.2
 jupyter_client==8.6.3
 jupyter_core==5.7.2
 matplotlib-inline==0.1.7
 multidict==6.2.0
 multiprocess==0.70.16
 nest-asyncio==1.6.0
 numpy==2.2.4
 packaging==24.2
 pandas==2.2.3
 parso==0.8.4
 pexpect==4.9.0
 platformdirs==4.3.7
 prompt_toolkit==3.0.50
 propcache==0.3.0
 psutil==7.0.0
 ptyprocess==0.7.0
 pure_eval==0.2.3
 pyarrow==19.0.1
 Pygments==2.19.1
 python-dateutil==2.9.0.post0
 pytz==2025.2
 PyYAML==6.0.2
 pyzmq==26.3.0
 requests==2.32.3
 six==1.17.0
 stack-data==0.6.3
 tornado==6.4.2
 tqdm==4.67.1
 traitlets==5.14.3
 typing_extensions==4.12.2
 tzdata==2025.2
 urllib3==2.3.0
 wcwidth==0.2.13
 xxhash==3.5.0
 yarl==1.18.3