기계학습_선형회귀 모델, 경사 하강법 실습 with python

경사 하강법 실습

$Y=4+3X+\epsilon$ ($0\le X \le 2$)를 따르는 샘플 100개를 생성한 후 선형회귀를 적용
SVD를 이용한 선형회귀인 LinearRegression, SGD를 이용한 선형회귀인 SGDRegressor, 배치 경사 하강법, 미니배치 경사 하강법을 비교 실습

# 데이터 샘플 생성 

import numpy as np 
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import SGDRegressor

import matplotlib.pyplot as plt 
%matplotlib inline

np.random.seed(20) #랜덤시드 고정

X = 2 * np.random.rand(100,1) #0~1의 균일분포 표준정규분포 난수를 matrix array(m,n) 생성
y = 4 + 3 * X + np.random.randn(100,1) #평균0, 표준편차1의 가우시안 표준정규분포 난수를 matrix array(m,n) 생성

plt.plot(X,y, 'b.') #파란색으로 산점도 그리기
plt.xlabel("$x_1$", fontsize=10)
plt.ylabel("$y$", rotation=0, fontsize=10)
plt.axis([0, 2, 0, 15]) #plt.axis([xmin,xmax,ymin,ymax]) 

# 각 특성벡터의 첫 번째 좌표에 bias에 대응되는 1을 추가하여 Xb로 수정 
# 여러 가지 경사 하강법을 직접 구현해 볼 때 사용

Xb = np.column_stack((np.ones((100,1)), X))  ##배열을 열로 만들어서 붙이기

X_new = np.array([[0],[2]])
Xb_new = np.column_stack((np.ones((2,1)), X_new))

LinearRegression

n_reg_svd = LinearRegression() 
lin_reg_svd.fit(X, y)
theta_svd = np.array([lin_reg_svd.intercept_, lin_reg_svd.coef_],dtype=float) #intercept_: 편향, coef_: 가중치
theta_svd 

#array([[3.8115615 ],
       [3.23892352]])
       
y_predict_svd = lin_reg_svd.predict(X_new)
y_predict_svd

#array([[ 3.8115615 ],
       [10.28940854]])
    
# 모델의 예측을 그래프에 나타내기 
plt.plot(X, y, 'b.') #파란색 점으로 산점도
plt.plot(X_new, y_predict_svd, 'r-') #빨간색 선으로 회귀직선 그리기
plt.xlabel("$x_1$", fontsize=10) #x좌표축
plt.ylabel("$y$", rotation=0, fontsize=10) #y좌표축
plt.axis([0, 2, 0, 15]) 
plt.title("Linear regression with SVD") #제목

배치 경사 하강법_ 직접 구현

그래디언트를 구하는 식 — (2)

\[(\nabla_ L({\theta})) = \dfrac 2 m \mathbf X^{\rm T}(\mathbf X{\theta}-\mathbf y)=\mathbf 0\]

${\theta}$를 업데이트하는 식 — (3)

\[{\theta}^{(t+1)}:= {\theta}^{(t)}-\alpha \nabla_L({\theta}^{(t)})\ (t\ge 0)\]

eta = 0.1 # 학습률 
n_epochs = 1000 # epoch 수 
m = 100 # 샘플수 

theta_bgd = np.random.randn(2,1)  # 무작위로 theta 초깃값 설정 

for iteration in range(n_epochs):
    gradients = 2/m * Xb.T.dot(Xb.dot(theta_bgd) -y)   # 설명의 식 (2) 구현
    theta_bgd = theta_bgd - eta * gradients                # 설명의 식 (3) 구현 
    
print(f"theta_bgd:\n {theta_bgd}")
print()
print(f"theta_svd:\n {theta_svd}")

#theta_bgd:
 [[3.8115615 ]
 [3.23892352]]

#theta_svd:
 [[3.8115615 ]
 [3.23892352]]

y_predict_bgd = Xb_new.dot(theta_bgd)
plt.plot(X, y, 'b.')
plt.plot(X_new, y_predict_bgd, 'r-')
plt.xlabel("$x_1$", fontsize=10)
plt.ylabel("$y$", rotation=0, fontsize=10)
plt.axis([0, 2, 0, 15])
plt.title(f"Linear regression with BGD (learning rate={eta})")

확률적 경사 하강법_ 직접 구현

theta_sgd_path = []
n_epochs = 50
t0, t1 = 5, 50  # 학습 스케줄 하이퍼파라미터 
m = 100         # 샘플 수 

def learning_schedule(t): #학습률 설정
    return t0/(t+t1)

theta_sgd = np.random.randn(2,1)  # theta 무작위 초기화 

for epoch in range(n_epochs):
    for i in range(m):
        random_idx = np.random.randint(m) #0~m-1 사이 정수 하나 반환, size로 반환 갯수 설정 가능
        tx = Xb[random_idx:random_idx+1]
        ty = y[random_idx:random_idx+1] #랜덤하게 한 점 선택
        gradients = 2 * tx.T.dot(tx.dot(theta_sgd) - ty)
        eta = learning_schedule(epoch * m + i)
        theta_sgd = theta_sgd - eta * gradients
        theta_sgd_path.append(theta_sgd)

확률적 경사 하강법 그림으로 학습 과정 파악하기

theta_sgd = np.random.randn(2,1)  # 랜덤 초기화
plt.figure(figsize=(5,4))

for i in range(21):
    y_predict_sgd = Xb_new.dot(theta_sgd)            
    if i==0:        
        plt.plot(X_new, y_predict_sgd, "m-", linewidth=1)
    else:
        plt.plot(X_new, y_predict_sgd, "g-", linewidth=1, alpha=0.05*i)
    random_idx = np.random.randint(m)
    tx = Xb[random_idx:random_idx+1]
    ty = y[random_idx:random_idx+1]
    gradients = 2 * tx.T.dot(tx.dot(theta_sgd) - ty)
    eta = learning_schedule(i)
    theta_sgd = theta_sgd - eta * gradients
    
plt.plot(X, y, "b.")                                 
plt.xlabel("$x_1$", fontsize=10)                      
plt.ylabel("$y$", rotation=0, fontsize=10)           
plt.axis([0, 2, 0, 15])     

확률적 경사 하강법_ sklearn.linear_model 모듈의 SGDRegressor 활용

sgd_reg = SGDRegressor(max_iter=1000, tol=1e-3, penalty=None, eta0=0.1)
sgd_reg.fit(X,y.ravel()) # 2차원 배열 y를 1차원 배열로

sgd_reg.intercept_, sgd_reg.coef_
#(array([3.75972497]), array([3.16465032]))

미니배치 경사 하강법_직접 구현

theta_mgd_path = []

n_epochs = 50
minibatch_size = 20

np.random.seed(42)
theta_mgd = np.random.randn(2,1)  # 랜덤 초기화

t0, t1 = 200, 1000
def learning_schedule(t):
    return t0 / (t + t1)

t = 0
for epoch in range(n_epochs):
    shuffled_indices = np.random.permutation(m)
    Xb_shuffled = Xb[shuffled_indices]
    y_shuffled = y[shuffled_indices]
    for i in range(0, m, minibatch_size):
        t += 1
        tx = Xb_shuffled[i:i+minibatch_size]
        ty = y_shuffled[i:i+minibatch_size]
        gradients = 2/minibatch_size * tx.T.dot(tx.dot(theta_mgd) - ty)
        eta = learning_schedule(t)
        theta_mgd = theta_mgd - eta * gradients
        theta_mgd_path.append(theta_mgd)
        
print(f"theta_mgd:\n {theta_mgd}")

#theta_mgd:
 [[3.80536591]
 [3.25424808]]

일반화 오차에 대한 편향-분산 분해

동일한 학습모델이라도 서로 다른 훈련 데이터셋을 통해 학습한 결과는 다를 수 있음 (훈련 샘플이 같은 분포에서 샘플링 된 경우라도 마찬가지)
검증 데이터셋을 통해 학습모델의 일반화 성능을 예측할 때, 왜 이런 성능을 얻게 되었는지를 분석하기 위해서 편향-분산 분해(bias-variance decomposition)를 이해하는 것이 필요함
회귀문제에서 데이터셋 $D$의 샘플 $\mathbf x$에 대응되는 실제 레이블을 $y_{\text{r}}$, 정답 레이블을 $y_{D}$, $D$의 훈련 데이터셋에 의해 학습된 모델의 예측 레이블을 $f(\mathbf x;D)$라고 나타내자. (주어진 데이터셋의 일부는 훈련 데이터셋, 나머지는 테스트 데이터셋)
\[즉, y_{D}= y_{\text{r}}+\text{noise}\]
데이터 샘플로 주어지는 것은 $(\mathbf x,y_{D})$이고 편의상 $\text{E}(y_{D})=y_{r}$이라 가정 (이때, 기댓값은 서로 다른 데이터셋에 대한 평균)
검증(또는 테스트) 샘플 $\mathbf x$에 대해, 똑같은 샘플 수의 서로 다른 훈련 세트를 사용하여 얻어진 분산 $\text{E}_D(f(\mathbf x;D) -y_D)^2)$ 으로 학습 모델 $f$ 에 대한 일반화된 오차를 이해해보자.
$\bar f(\mathbf x)=\text{E}_D(f(\mathbf x;D))$라 할 때,

\[\text{E}_D((f(\mathbf x;D)-y_D)^2) =\text{E}_D((f(\mathbf x;D)-\bar f(\mathbf x))^2)+(\bar f(\mathbf x)-y_{\text{r}})^2+\text{E}_D((y_{\text{r}}-y_D)^2)\]

위 식과 같이 일반화된 오차를 분해하는 것을 편향-분산(-노이즈) 분해라고 함.

분해식의 첫번째는 샘플 x에 대한 학습 모델 $f$ 의 예측에 대한 분산
두번째는 샘플 x에 대한 학습 모델 $f$ 의 예측에 대한 편향
세번째는 노이즈
- 분산은 크기가 같은 훈련 데이터셋이 바뀔 때 발생하는 성능의 변화를 나타냄
- 편향은 학습 알고리즘의 기대 예측값이 실제 데이터에서 떨어진 정도를 나타내고, 학습 모델의 적합 능력을 나타냄
- 노이즈는 어떤 학습 모델을 사용하더라도 극복할 수 없는 일반화의 오차의 하계를 뜻함. 즉 학습의 본질적 어려움을 나타냄
일반적으로 편향과 분산은 상충 관계에 있음(편향과 분산의 딜레마)
과소적합 이면 편향이 큰 것; 모델을 잘못 설정(가정)했기 때문에 편향이 커진다.
과대적합이면 분산이 큰 것; 학습 모델의 복잡도가 크다는 의미와 같으므로 큰 분산을 가진다.
노이즈는 줄일 수 없는 오차; 데이터 정제, 잡음 제거 등의 과정이 필요하다.
훈련 데이터셋에 대한 오차와 검증 데이터셋에 대한 오차가 모두 크면 과소적합되었을 가능성이 크며, 이 경우 학습 모델의 복잡도를 높이는 것이 도움이 된다.
훈련 데이터셋에 대한 오차보다 검증 데이터셋에 대한 오차가 크면 과대적합되었을 가능성이 크며, 규제를 적용하면 도움이 된다.

Reference:

기계학습_하길찬 교수님 수업을 바탕으로 공부한 내용입니다.

← Previous Post Next Post →

경사 하강법 실습