[빅데이터분석기사 실기] 함수 모음집

자격증

[빅데이터분석기사 실기] 함수 모음집 - 유형 2

Suda_777 2024. 5. 30. 00:58

1. 시험 설명

- 유형 2
- 문제 수: 1문제 (40점)
- 주제: 데이터 분석 (분류/회귀/비지도학습)

2. 암기할 클래스 모음

tree 모델

from sklearn.tree import DecisionTreeClassifier
model = DecisionTreeClassifier(random_state=1,
                               max_depth=10,
                               min_samples_split=2,
                               min_samples_leaf=1)


from sklearn.tree import DecisionTreeRegressor
model = DecisionTreeRegressor(random_state=1,
                             max_depth=10,
                             min_samples_split=2,
                             min_samples_leaf=1)


from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(random_state=1,
                               n_estimators=3,
                               max_depth=10,
                               min_samples_split=2,
                               min_samples_leaf=1)
                               
                               
from sklearn.ensemble import RandomForestRegressor
model = RandomForestRegressor(random_state=1,
                              n_estimators=3,
                              max_depth=10,
                              min_samples_split=2,
                              min_samples_leaf=1)


import xgboost as xgb
model = xgb.XGBRegressor(random_state=1,
                         n_estimators=3,
                         max_depth=10,
                         min_child_weight=1)


import xgboost as xgb
model = xgb.XGBClassifier(random_state=1,
                         n_estimators=20,
                         max_depth=100,
                         min_child_weight=1)


import lightgbm as lgb
model = lgb.LGBMRegressor(random_state=1,
                         n_estimators=3,
                         max_depth=10,
                         num_leaves=2,
                         min_child_samples=1)


import lightgbm as lgb
model = lgb.LGBMClassifier(random_state=1,
                         n_estimators=3,
                         max_depth=10,
                         num_leaves=2,
                         min_child_samples=1)


from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import ExtraTreesRegressor

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import GradientBoostingRegressor

from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import AdaBoostRegressor

선형 모델

from sklearn.linear_model import LogisticRegression
model = LogisticRegression(
    penalty='l2',         # 규제의 유형: 'l1', 'l2', 'elasticnet', None (기본값은 'l2')
                          # 문자열 'none'은 scikit-learn 1.2부터 deprecated → None 사용
    C=1.0,                # 규제 강도, 값이 작을수록 강한 규제 (기본값은 1.0)
    max_iter=100,         # 최대 반복 횟수 (기본값은 100)
    random_state=1,       # 난수 시드 (재현 가능성을 위해 설정)
    solver='lbfgs',       # 최적화 알고리즘: 'newton-cg', 'lbfgs',
    					  # 'liblinear', 'sag', 'saga' (기본값은 'lbfgs')
)


from sklearn.linear_model import LinearRegression
model = LinearRegression()


from sklearn.linear_model import Ridge
model = Ridge(alpha=1.0) # 규제 강도: 0 이상의 실수, 기본값은 1.0


from sklearn.linear_model import Lasso
model = Lasso(alpha=1.0)


from sklearn.linear_model import ElasticNet
model = ElasticNet(alpha=1, # 정규화 강도
	l1_ratio=0.5) # L1, L2비율

Support Vector Machine (SVM)

from sklearn.svm import SVC
model = SVC(C=1.0, # 규제 매개변수, 값이 작을수록 규제가 강해져 오분류를 더 허용
	kernel='rbf', # linear, poly, sigmoid
    gamma='scale', # 커널의 계수, rbf/poly/sigmoid에서 사용
    probability=True) # 확률 추정

from sklearn.svm import LinearSVC
model = LinearSVC(C=1.0)


from sklearn.svm import SVR
model = SVR(C=1.0, 
	kernel='rbf', 
    gamma='scale')


from sklearn.svm import LinearSVR
model = LinearSVR(C=1.0)

K-최근접 이웃 (KNN)

from sklearn.neighbors import KNeighborsClassifier
model = KNeighborsClassifier(n_neighbors=3)  # 최근접 이웃 3개 사용

from sklearn.neighbors import KNeighborsRegressor
model = KNeighborsRegressor(n_neighbors=3)  # 최근접 이웃 3개 사용

기타 분류 모델

from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis # 선형 판별 분석
from sklearn.neural_network import MLPClassifier # 다층 퍼셉트론

기타 회귀 모델

from sklearn.neural_network import MLPRegressor

비지도 학습 - 군집화(Clustering)

from sklearn.cluster import KMeans
model = KMeans(n_clusters=3)  # 3개의 군집으로 분류


from sklearn.cluster import DBSCAN # 밀도 기반 군집화
model = DBSCAN(eps=0.5, min_samples=5)  # 최대 거리 0.5, 최소 샘플 5개


from sklearn.cluster import AgglomerativeClustering # 계층적 군집화
model = AgglomerativeClustering(n_clusters=3, # 3개의 군집
	linkage='ward') # Ward 연결 방식

모델 검증

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
	X, y, test_size=0.2, shuffle=True, stratify=y, random_state=42  # stratify=y는 분류 문제 전용 (회귀에서는 stratify 인자 제거)
)


from sklearn.model_selection import cross_val_score
scores = cross_val_score(model, X, y, cv=5, scoring='accuracy')


from sklearn.model_selection import GridSearchCV
param_grid = {  # SVC용 하이퍼파라미터 예시 (아래 estimator의 model이 SVC라고 가정)
    'C': [0.1, 1, 10],
    'kernel': ['linear', 'rbf'],
    'gamma': ['scale', 'auto']
}
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, scoring='accuracy')


from sklearn.model_selection import RandomizedSearchCV

메트릭

# 회귀 모델 평가 지표
from sklearn.metrics import r2_score
r2 = r2_score(y_true, y_pred)

from sklearn.metrics import log_loss # 로그 손실 (분류 모델용 지표, 예측 확률을 입력)
logloss = log_loss(y_true, y_pred_proba) # 변수명을 함수명과 다르게 해야 log_loss 함수를 다시 호출할 수 있음

from sklearn.metrics import mean_squared_error
mse = mean_squared_error(y_true, y_pred)

from math import sqrt
rmse = sqrt(mse)

# 분류모델 평가지표
from sklearn.metrics import accuracy_score # 정확도
accuracy = accuracy_score(y_true, y_pred)

from sklearn.metrics import precision_score # 정밀도
precision = precision_score(y_true, y_pred)

from sklearn.metrics import recall_score # 재현율
recall = recall_score(y_true, y_pred)

from sklearn.metrics import f1_score # F1 점수
f1 = f1_score(y_true, y_pred)

from sklearn.metrics import roc_auc_score # ROC AUC 점수
roc_auc = roc_auc_score(y_true, y_pred_proba) # 이진 분류: 양성 클래스 확률 사용, 예) model.predict_proba(X_test)[:, 1]

from sklearn.metrics import confusion_matrix # 혼동 행렬
confusion = confusion_matrix(y_true, y_pred)

from sklearn.metrics import classification_report # 분류 보고서
report = classification_report(y_true, y_pred)

연관분석

# 아프리오리 알고리즘
from mlxtend.frequent_patterns import apriori

frequent_itemsets = apriori(df, min_support=0.5, use_colnames=True) # df: 원-핫 인코딩(True/False 또는 0/1)된 거래 데이터프레임
print(frequent_itemsets)


# 연관 규칙
from mlxtend.frequent_patterns import association_rules

rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.7)
print(rules)

3. 암기할 파라미터 정리

다음 시간에...

저작자표시 비영리 변경금지