from transformers import pipeline

# Sentiment analysis with the default model
classifier = pipeline("sentiment-analysis")
result = classifier("I love Hugging Face!")
print(result)  # [{'label': 'POSITIVE', 'score': 0.9998}]

# Process several sentences in one call
results = classifier([
    "I love this!",
    "I hate this!",
    "This is okay.",
])
for result in results:
    print(result)
2.2 다양한 태스크
# Text generation
generator = pipeline("text-generation", model="gpt2")
result = generator("Once upon a time", max_length=50, num_return_sequences=2)
print(result)
# Question answering: extract the answer span from a context paragraph
qa_pipeline = pipeline("question-answering")
context = "Hugging Face is a company based in New York and Paris."
question = "Where is Hugging Face based?"
result = qa_pipeline(question=question, context=context)
print(result)
# e.g. {'score': 0.98, 'start': 35, 'end': 53, 'answer': 'New York and Paris'}
# Summarization
summarizer = pipeline("summarization")
text = """
The Hugging Face Hub is a platform with over 120,000 models, 20,000 datasets,
and 50,000 demo apps (Spaces), all open source and publicly available, in an
online platform where people can easily collaborate and build ML together.
"""
summary = summarizer(text, max_length=50, min_length=10)
print(summary)
# Translation (English -> French)
translator = pipeline("translation_en_to_fr")
result = translator("Hello, how are you?")
print(result)  # [{'translation_text': 'Bonjour, comment allez-vous?'}]
# Named entity recognition (NER); grouped_entities merges multi-token spans
ner = pipeline("ner", grouped_entities=True)
result = ner("My name is John and I live in New York.")
print(result)
# Zero-shot classification: label text without task-specific fine-tuning
classifier = pipeline("zero-shot-classification")
result = classifier(
    "This is a course about Python programming.",
    candidate_labels=["education", "politics", "business"],
)
print(result)
# Image classification
image_classifier = pipeline("image-classification")
result = image_classifier("path/to/image.jpg")
print(result)
# Automatic speech recognition (audio file -> transcript)
transcriber = pipeline("automatic-speech-recognition")
result = transcriber("audio.mp3")
print(result)
2.3 특정 모델 지정
# Korean sentiment analysis with an explicitly chosen model
classifier = pipeline(
    "sentiment-analysis",
    model="matthewburke/korean_sentiment",
)

# Llama-2 chat model (gated: requires access approval on the Hub)
generator = pipeline(
    "text-generation",
    model="meta-llama/Llama-2-7b-chat-hf",
)

# RoBERTa-based question answering fine-tuned on SQuAD 2.0
qa = pipeline(
    "question-answering",
    model="deepset/roberta-base-squad2",
)
3. 모델과 토크나이저 사용
3.1 기본 사용법
from transformers import AutoTokenizer, AutoModel

# Load model and tokenizer by Hub id
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

# Tokenize text into PyTorch tensors
text = "Hello, Hugging Face!"
inputs = tokenizer(text, return_tensors="pt")
print(inputs)
# {'input_ids': tensor([[...]]), 'attention_mask': tensor([[...]])}
# Apply tokenization to the whole dataset in batches.
# NOTE(review): `dataset` and `preprocess_function` are defined in an earlier
# section of the tutorial not shown here.
tokenized_dataset = dataset.map(preprocess_function, batched=True)
4.3 Trainer를 사용한 파인튜닝
from transformers import Trainer, TrainingArguments
from transformers import AutoModelForSequenceClassification
import numpy as np
# NOTE(review): `load_metric` is deprecated and removed in recent `datasets`
# releases; new code should use the separate `evaluate` package instead.
from datasets import load_metric

# Load BERT with a binary classification head
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=2,
)
# Search the Hub for models.
# NOTE(review): assumes `list_models` was imported and `api = HfApi()` was
# created earlier in the tutorial.
models = list(list_models(filter="text-classification", sort="downloads"))
for model in models[:5]:
    print(f"{model.modelId}: {model.downloads} downloads")

# Fetch metadata for a single model
model_info = api.model_info("bert-base-uncased")
print(model_info.tags)
print(model_info.pipeline_tag)
# Download a complete model snapshot (all files) from the Hub
from huggingface_hub import snapshot_download
# 또는 huggingface-cli 사용 # huggingface-cli upload username/my-model ./my-model
6.3 모델 카드 작성
# Build the README.md (model card) content.
# The original source used curly "smart quotes" for the closing delimiter,
# which is a syntax error; fixed to plain triple quotes.
model_card = """
---
language: en
license: apache-2.0
tags:
- text-classification
- sentiment-analysis
datasets:
- imdb
metrics:
- accuracy
---

# My Awesome Model

This model is fine-tuned for sentiment analysis.

## Usage

```python
from transformers import pipeline

classifier = pipeline("sentiment-analysis", model="username/my-awesome-model")
result = classifier("I love this!")
```

## Training

Trained on IMDB dataset with the following parameters:

- Learning rate: 2e-5
- Batch size: 16
- Epochs: 3
"""
# Write the card into the local model directory.
# Fixed: the original used curly quotes around the path and mode, which is
# invalid Python.
with open("./my-model/README.md", "w") as f:
    f.write(model_card)
## 7. 고급 기능
### 7.1 양자화 (Quantization)
메모리 사용량을 줄이고 추론 속도를 높입니다.
```python
from transformers import AutoModelForCausalLM, BitsAndBytesConfig
def batch_analyze(self, texts):
    """Run sentiment analysis on each text in `texts`.

    Returns a list with one analysis result (as produced by `self.analyze`)
    per input text, in the same order.
    """
    # Comprehension replaces the original append loop; behavior is identical.
    return [self.analyze(text) for text in texts]
# Usage (SentimentAnalyzer is defined earlier in the tutorial)
analyzer = SentimentAnalyzer()
result = analyzer.analyze("This product is amazing!")
print(result)
# Batch processing
texts = [
    "I love this!",
    "This is terrible.",
    "It's okay, nothing special.",
]
results = analyzer.batch_analyze(texts)
for text, result in zip(texts, results):
    print(f"{text}: {result['label']} ({result['confidence']:.2%})")
8.2 커스텀 챗봇
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

# Simple REPL-style chat loop
print("Chatbot: Hello! How can I help you?")
while True:
    user_input = input("You: ")
    if user_input.lower() in ["quit", "exit", "bye"]:
        print("Chatbot: Goodbye!")
        break
    # NOTE(review): the response-generation part of the loop body is not
    # visible in this chunk of the tutorial; the model/tokenizer loaded above
    # are presumably used here to generate a reply.
from transformers import pipeline
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings


class DocumentQA:
    """Retrieval-based question answering over a set of documents."""

    def __init__(self):
        # Embedding model used to vectorize documents for similarity search
        self.embeddings = HuggingFaceEmbeddings(
            model_name="sentence-transformers/all-MiniLM-L6-v2"
        )
        # NOTE(review): the remaining methods of this class (load_documents,
        # answer) are not visible in this chunk of the tutorial.
# Example documents to index
documents = [
    """
    Hugging Face is a company that develops tools for building applications
    using machine learning. It is most notable for its transformers library
    built for natural language processing applications and its platform that
    allows users to share machine learning models and datasets.
    """,
    """
    The Transformers library provides thousands of pretrained models to
    perform tasks on different modalities such as text, vision, and audio.
    These models can be applied on: Text, for tasks like text classification,
    information extraction, question answering, summarization, translation,
    and text generation.
    """,
]

# NOTE(review): `qa_system` is a DocumentQA instance created earlier in the
# tutorial.
qa_system.load_documents(documents)

question = "What is Hugging Face?"
result = qa_system.answer(question)
# 디코딩 captions = self.tokenizer.batch_decode(output_ids, skip_special_tokens=True) return [caption.strip() for caption in captions]
# Usage (ImageCaptioner is defined earlier in the tutorial)
captioner = ImageCaptioner()
captions = captioner.generate_caption("image.jpg", num_captions=3)

for i, caption in enumerate(captions, 1):
    print(f"Caption {i}: {caption}")
9. Spaces - 모델 데모 호스팅
9.1 Gradio 앱 생성
pip install gradio
# app.py
import gradio as gr
from transformers import pipeline

# Load the pipeline once at startup
classifier = pipeline("sentiment-analysis")


def analyze_sentiment(text):
    """Classify `text` and format the top result as 'LABEL: score%'."""
    result = classifier(text)[0]
    return f"{result['label']}: {result['score']:.2%}"


# Gradio interface definition
interface = gr.Interface(
    fn=analyze_sentiment,
    inputs=gr.Textbox(lines=5, placeholder="Enter text here..."),
    outputs="text",
    title="Sentiment Analysis",
    description="Analyze the sentiment of any text!",
)

if __name__ == "__main__":
    interface.launch()
9.2 Hugging Face Spaces에 배포
# 1. Space 생성 (웹에서)
#    https://huggingface.co/new-space

# 2. 로컬에서 클론
git clone https://huggingface.co/spaces/username/my-space
cd my-space

# 3. 파일 추가
# - app.py (메인 코드)
# - requirements.txt
# - README.md
# Load the pipeline; cache_resource keeps it alive across Streamlit reruns
@st.cache_resource
def load_model(name):
    """Create (or return the cached) sentiment-analysis pipeline for `name`."""
    return pipeline("sentiment-analysis", model=name)


# NOTE(review): `model_name` is presumably selected by a widget earlier in
# this app — not visible in this chunk.
classifier = load_model(model_name)
# Text input area
text = st.text_area("Enter text to analyze:", height=150)

# Analyze button; the `else` pairs with the empty-input check
if st.button("Analyze"):
    if text:
        with st.spinner("Analyzing..."):
            result = classifier(text)[0]

        st.success("Analysis Complete!")

        col1, col2 = st.columns(2)
        with col1:
            st.metric("Label", result["label"])
        with col2:
            st.metric("Confidence", f"{result['score']:.2%}")
    else:
        st.warning("Please enter some text!")

# Run with: streamlit run app.py
10. 베스트 프랙티스
10.1 메모리 최적화
# 1. Use a small per-device batch and accumulate gradients
training_args = TrainingArguments(
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,  # effective batch size of 16
)
from transformers import pipeline
from requests.exceptions import HTTPError


def safe_inference(text, max_retries=3):
    """Run the classifier on `text`, retrying transient HTTP failures.

    Retries up to `max_retries` times on HTTPError, re-raising it once the
    attempts are exhausted. Any other exception is reported and swallowed,
    returning None (best-effort behavior preserved from the original).
    """
    for attempt in range(max_retries):
        try:
            result = classifier(text)
            return result
        except HTTPError:
            if attempt < max_retries - 1:
                print(f"Retry {attempt + 1}/{max_retries}")
                continue
            raise
        except Exception as e:
            print(f"Error: {e}")
            return None
# Check input length against the model's token limit
def validate_input(text, max_length=512):
    """Return True (and print a warning) when `text` tokenizes to more than
    `max_length` tokens, otherwise False.

    NOTE(review): relies on a module-level `tokenizer` created earlier in the
    tutorial.
    """
    tokenized = tokenizer.tokenize(text)
    if len(tokenized) > max_length:
        print(f"Warning: Text too long ({len(tokenized)} tokens). Truncating...")
        return True
    return False