from abc import abstractmethod

import numpy as np

from neural_net.transform_layer import Layer


class ActivationLayer(Layer):
    def __init__(self, index, input_dim, output_dim, weights=None, biases=None):
        super().__init__('ActivationLayer', index, input_dim, output_dim)
        self.type = 'ActivationLayer'
        self.subtype = ''
        self.inputs = np.array([])
        self.output = np.array([])
        self.z = np.array([])
        self.gradient_clip = 1.0

        # Initialize weights and biases, using the provided values if given
        if weights is not None:
            self.weights = weights
        else:
            self.initialize_weights()
        if biases is not None:
            self.biases = biases
        else:
            self.initialize_biases()

    def describe(self):
        return f"{self.type} ({self.input_dim}x{self.output_dim} neurons, {self.subtype} activation)"

    @abstractmethod
    def initialize_weights(self):
        pass

    @abstractmethod
    def initialize_biases(self):
        pass

    def forward(self, inputs: np.ndarray):
        self.inputs = inputs
        self.z = np.dot(self.inputs, self.weights) + self.biases
        # Calls the subclass's activation function (e.g. Sigmoid)
        self.output = self.activation(self.z)
        return self.output

    def backward(self, dL_dout, learning_rate):
        """
        Backpropagate the error and update the weights and biases.

        :param dL_dout: Gradient of the loss with respect to the layer outputs
        :param learning_rate: Learning rate for the parameter updates
        :return: Tuple (dL_dinputs, dL_dweights, dL_dbias, weights, biases),
                 where dL_dinputs is the gradient passed to the previous layer
        """
        # Activation derivative dout/dz:
        # how much the activation output changes with respect to the
        # pre-activation value z. Note that self.output (the already-activated
        # values) is passed in, so e.g. the Sigmoid derivative
        # sigma(z) * (1 - sigma(z)) is computed as output * (1 - output).
        dout_dz = self.activation_derivative(self.output)

        # Gradient of the loss with respect to the weights (dL/dweights):
        # how much the loss changes when the weights change.
        # Formula: dL/dweights = inputs.T @ (dL/dout * dout/dz)
        # The result is clipped elementwise to +/- self.gradient_clip to keep
        # updates bounded; the bias gradient is left unclipped.
        dL_dweights = np.clip(np.dot(self.inputs.T, dL_dout * dout_dz),
                              -self.gradient_clip, self.gradient_clip)
        dL_dbias = np.sum(dL_dout * dout_dz, axis=0)

        # Gradient of the loss with respect to the inputs (dL/dinputs),
        # needed to backpropagate further into the previous layer.
        # Formula: dL/dinputs = (dL/dout * dout/dz) @ weights.T
        dL_dinputs = np.dot(dL_dout * dout_dz, self.weights.T)

        # Gradient-descent step on the parameters
        self.weights -= learning_rate * dL_dweights
        self.biases -= learning_rate * dL_dbias

        return dL_dinputs, dL_dweights, dL_dbias, self.weights, self.biases

    def reset(self):
        self.initialize_weights()
        self.initialize_biases()

    @abstractmethod
    def activation(self, raw_outputs: np.ndarray):
        """Apply the activation function (Sigmoid, ReLU, etc.)."""
        pass

    @abstractmethod
    def activation_derivative(self, outputs: np.ndarray):
        """Compute the derivative of the activation function."""
        pass
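

# ---------------------------------------------------------------------------
# Illustrative sketch (not part of this module's API): a minimal Sigmoid
# subclass showing how the abstract hooks above fit together, plus a smoke
# test of one forward/backward pass. The class name, the Xavier-style weight
# initialization, and the test values are assumptions for demonstration only.
# It also assumes the Layer base class sets self.input_dim and
# self.output_dim, as describe() implies.
# ---------------------------------------------------------------------------
class _ExampleSigmoidLayer(ActivationLayer):

    def __init__(self, index, input_dim, output_dim, weights=None, biases=None):
        super().__init__(index, input_dim, output_dim, weights, biases)
        self.subtype = 'Sigmoid'

    def initialize_weights(self):
        # Xavier-style initialization (an assumed scheme; the real
        # subclasses may use a different one)
        self.weights = np.random.randn(self.input_dim, self.output_dim) / np.sqrt(self.input_dim)

    def initialize_biases(self):
        self.biases = np.zeros(self.output_dim)

    def activation(self, raw_outputs: np.ndarray):
        return 1.0 / (1.0 + np.exp(-raw_outputs))

    def activation_derivative(self, outputs: np.ndarray):
        # backward() passes self.output, so the derivative is written in
        # terms of the activated value: sigma'(z) = output * (1 - output)
        return outputs * (1.0 - outputs)


if __name__ == '__main__':
    # Smoke test: one forward and backward pass on a random mini-batch
    rng = np.random.default_rng(seed=0)
    layer = _ExampleSigmoidLayer(index=0, input_dim=4, output_dim=3)
    x = rng.standard_normal((8, 4))        # batch of 8 samples
    out = layer.forward(x)                 # shape (8, 3)
    dL_dout = out - rng.random((8, 3))     # stand-in upstream gradient
    dL_dinputs, *_ = layer.backward(dL_dout, learning_rate=0.1)
    print(layer.describe(), out.shape, dL_dinputs.shape)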