vault backup: 2025-01-11 15:28:19
parent 89bdca19a6
commit 8d1be3fedd
3 changed files with 185 additions and 129 deletions
160 .obsidian/workspace.json (vendored)
@@ -8,128 +8,31 @@
"type": "tabs",
"children": [
{
"id": "34cfdf6e8485642e",
"id": "a1bfc487c4cf997d",
"type": "leaf",
"state": {
"type": "markdown",
"state": {
"file": "Foundation of data science/notes/8 Variational Autoencoders.md",
"file": "Foundation of data science/notes/9 XGBoost.md",
"mode": "source",
"source": false
},
"icon": "lucide-file",
"title": "8 Variational Autoencoders"
"title": "9 XGBoost"
}
},
{
"id": "dee6b7fc799ba9d4",
"type": "leaf",
"state": {
"type": "pdf",
"state": {
"file": "Biometric Systems/slides/LEZIONE6_Face recognition2D.pdf",
"page": 72,
"left": -373,
"top": 539,
"zoom": 0.8
},
"icon": "lucide-file-text",
"title": "LEZIONE6_Face recognition2D"
}
},
{
"id": "6dd2d86707236bd3",
"type": "leaf",
"state": {
"type": "pdf",
"state": {
"file": "Biometric Systems/slides/LEZIONE11_Fingerprints.pdf",
"page": 12,
"left": -173,
"top": 252,
"zoom": 1.1
},
"icon": "lucide-file-text",
"title": "LEZIONE11_Fingerprints"
}
},
{
"id": "6e01744dc56c5469",
"type": "leaf",
"state": {
"type": "pdf",
"state": {
"file": "Biometric Systems/slides/LEZIONE4_Face introduction and localization.pdf",
"page": 6,
"left": -129,
"top": 482,
"zoom": 1.2
},
"icon": "lucide-file-text",
"title": "LEZIONE4_Face introduction and localization"
}
},
{
"id": "2854bd3b52a5b340",
"type": "leaf",
"state": {
"type": "pdf",
"state": {
"file": "Biometric Systems/slides/LEZIONE12_MULBIOMETRIC.pdf",
"page": 1,
"left": -8,
"top": 552,
"zoom": 1.9210084033613448
},
"icon": "lucide-file-text",
"title": "LEZIONE12_MULBIOMETRIC"
}
},
{
"id": "c74d17cfbe39864e",
"id": "990d879cbcbc04b3",
"type": "leaf",
"state": {
"type": "markdown",
"state": {
"file": "Biometric Systems/notes/2. Performance indexes.md",
"file": "Foundation of data science/notes/9 Gradient Boosting.md",
"mode": "source",
"source": false
},
"icon": "lucide-file",
"title": "2. Performance indexes"
}
},
{
"id": "a04fd1765772f474",
"type": "leaf",
"state": {
"type": "pdf",
"state": {
"file": "Biometric Systems/slides/LEZIONE8_Face antispoofing.pdf",
"page": 31,
"left": -226,
"top": 539,
"zoom": 1
},
"icon": "lucide-file-text",
"title": "LEZIONE8_Face antispoofing"
}
},
{
"id": "b0046265ddcac2a4",
"type": "leaf",
"dimension": 14.285714285714286,
"state": {
"type": "pdf",
"state": {
"file": "Biometric Systems/slides/LEZIONE2_Indici_di_prestazione.pdf",
"page": 36,
"left": -109,
"top": 380,
"zoom": 1.25
},
"icon": "lucide-file-text",
"title": "LEZIONE2_Indici_di_prestazione"
"title": "9 Gradient Boosting"
}
}
]
@@ -302,46 +205,45 @@
"companion:Toggle completion": false
}
},
"active": "34cfdf6e8485642e",
"active": "a1bfc487c4cf997d",
"lastOpenFiles": [
"Foundation of data science/slides/Traditional discriminative approaches.pdf",
"Foundation of data science/notes/9 XGBoost.md",
"Foundation of data science/notes/Untitled.md",
"Untitled",
"Foundation of data science/notes/9 Gradient Boosting.md",
"Foundation of data science/notes/9 Random Forest.md",
"Foundation of data science/notes/9 Decision tree.md",
"Biometric Systems/slides/Riassunto_2021_2022.pdf",
"Biometric Systems/slides/LEZIONE3_Affidabilita_del_riconoscimento.pdf",
"Biometric Systems/slides/LEZIONE4_Face introduction and localization.pdf",
"Biometric Systems/slides/LEZIONE11_Fingerprints.pdf",
"Biometric Systems/slides/LEZIONE6_Face recognition2D.pdf",
"Foundation of data science/slides/FDS_backprop_new.pdf",
"Foundation of data science/slides/FDS_backprop_new 1.pdf",
"Foundation of data science/notes/8 Variational Autoencoders.md",
"Foundation of data science/slides/Variational Autoencoders.pdf",
"Foundation of data science/notes/7 Autoencoders.md",
"Biometric Systems/notes/2. Performance indexes.md",
"Biometric Systems/notes/8 Face anti spoofing.md",
"Biometric Systems/notes/13. Multi biometric.md",
"Biometric Systems/notes/11. Fingerprints.md",
"Biometric Systems/notes/3. Recognition Reliability.md",
"Biometric Systems/notes/6. Face recognition 2D.md",
"Biometric Systems/notes/4. Face detection.md",
"Foundation of data science/notes/1 CV Basics.md",
"Foundation of data science/notes/6 PCA.md",
"Foundation of data science/notes/5 Neural Networks.md",
"Biometric Systems/slides/LEZIONE11_Fingerprints.pdf",
"Biometric Systems/notes/11. Fingerprints.md",
"Biometric Systems/slides/LEZIONE2bis_Indici_di_prestazione.pdf",
"Biometric Systems/slides/LEZIONE2_Indici_di_prestazione.pdf",
"Biometric Systems/notes/2. Performance indexes.md",
"Biometric Systems/notes/3. Recognition Reliability.md",
"Biometric Systems/notes/4. Face detection.md",
"Biometric Systems/notes/6. Face recognition 2D.md",
"Biometric Systems/slides/Biometric_System___Notes.pdf",
"Biometric Systems/slides/LEZIONE6_Face recognition2D.pdf",
"Biometric Systems/slides/LEZIONE4_Face introduction and localization.pdf",
"Biometric Systems/slides/LEZIONE12_MULBIOMETRIC.pdf",
"Biometric Systems/slides/LEZIONE8_Face antispoofing.pdf",
"Biometric Systems/notes/1. Introduction.md",
"Biometric Systems/notes/8 Face anti spoofing.md",
"Foundation of data science/notes/3.2 LLM generated from notes.md",
"Foundation of data science/notes/4 L1 and L2 normalization - Lasso and Ridge.md",
"Foundation of data science/notes/3.1 Multi Class Logistic Regression.md",
"Foundation of data science/notes/3 Logistic Regression.md",
"Foundation of data science/notes/2 Linear Regression.md",
"Foundation of data science/notes/Untitled.md",
"Foundation of data science/notes/9 Random Forest.md",
"Foundation of data science/notes/9 K-Nearest Neighbors.md",
"Foundation of data science/notes/9 Gradient Boosting.md",
"Foundation of data science/slides/multiclass_crossentropy_biasvariance.pdf",
"Foundation of data science/slides/More on Neural Networks (1).pdf",
"Biometric Systems/notes/multi bio.md",
"Biometric Systems/notes/13. Multi biometric.md",
"Biometric Systems/notes/7. Face recognition 3D.md",
"Biometric Systems/notes/9. Ear recognition.md",
"Biometric Systems/images/Pasted image 20241228171617.png",
"Biometric Systems/images/Pasted image 20241228174722.png",
"Biometric Systems/notes/12. Iris recognition.md",
"Foundation of data science/notes/9 Decision tree.md",
"Foundation of data science/notes/8 Variational Autoencoders.md",
"Biometric Systems/images/Pasted image 20241217025904.png",
"Biometric Systems/images/Pasted image 20241217030157.png",
"Biometric Systems/images/Pasted image 20241212094046.png",
@@ -40,6 +40,8 @@ $$, where ~ is the adjacency relation between two nodes in the graph.

The idea is that the model can rely on the previous and the next frame to assign the label to the current frame. This helps it recognize eye blinks more reliably, for example when the current frame is very noisy.

AdaBoost is used to recognize the closed or open state of the eye.

![[Pasted image 20241113143053.png]]

#### Määttä - Micro-texture
152 Foundation of data science/notes/9 XGBoost.md (Normal file)
@@ -0,0 +1,152 @@
### **What is XGBoost?**

XGBoost (**eXtreme Gradient Boosting**) is an optimized and scalable implementation of Gradient Boosting designed for speed and performance. It builds on the classic gradient boosting algorithm with several enhancements, making it one of the most widely used machine learning libraries for structured/tabular data.

---

### **Key Differences Between XGBoost and Classic Gradient Boosting**

|Feature|Classic Gradient Boosting|XGBoost|
|---|---|---|
|**Regularization**|Basic or none|L1 (Lasso) and L2 (Ridge) regularization for weights to control overfitting.|
|**Loss Function**|Standard loss functions (e.g., MSE)|Customizable loss functions with second-order Taylor approximation for faster optimization.|
|**Tree Construction**|Level-wise growth (splits all nodes at a given depth)|Leaf-wise growth with depth constraints (reduces loss more efficiently).|
|**Parallelism**|Limited|Parallelized tree construction for faster computation.|
|**Missing Values**|Must be imputed|Handles missing values internally by learning optimal splits for them.|
|**Sparsity Awareness**|Not optimized|Efficiently handles sparse data by skipping zero values.|
|**Pruning**|None or minimal|Post-pruning to remove nodes that do not improve the loss significantly.|
|**Performance**|Moderate speed and scalability|Highly optimized for speed and memory efficiency, often faster in practice.|
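
A minimal usage sketch of these features, assuming the `xgboost` Python package with its scikit-learn wrapper; the toy dataset and hyperparameter values below are purely illustrative:

```python
import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier  # assumes the xgboost package is installed

# Toy tabular dataset with some missing values (XGBoost handles NaNs natively).
X, y = make_classification(n_samples=1000, n_features=20, random_state=0)
X[np.random.default_rng(0).random(X.shape) < 0.05] = np.nan

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

model = XGBClassifier(
    n_estimators=200,    # number of boosted trees
    learning_rate=0.1,   # eta: shrinkage applied to each tree's contribution
    max_depth=4,         # depth constraint on leaf-wise growth
    reg_lambda=1.0,      # L2 penalty on leaf weights (lambda)
    reg_alpha=0.0,       # L1 penalty on leaf weights (alpha)
    gamma=0.0,           # minimum loss reduction required to make a split
    tree_method="hist",  # fast histogram-based tree construction
)
model.fit(X_train, y_train)
print("test accuracy:", model.score(X_test, y_test))
```

Here `reg_lambda`, `reg_alpha`, and `gamma` correspond to the λ, L1, and γ penalties discussed in the loss function below.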
---
### **XGBoost Loss Function**

XGBoost allows users to define a custom loss function, but it relies on second-order Taylor expansion (both gradient and Hessian) to optimize the objective. The general loss function in XGBoost consists of two components:

$$L(\Theta) = \sum_{i=1}^n l(y_i, \hat{y}_i) + \sum_{k=1}^T \Omega(h_k)$$

1. **First Term: Training Loss ($l(y_i, \hat{y}_i)$)**

    - Measures how well the predictions $\hat{y}_i$ match the true labels $y_i$.
    - Common choices:
        - Mean Squared Error (MSE) for regression: $l(y_i, \hat{y}_i) = (y_i - \hat{y}_i)^2$
        - Log Loss for binary classification: $l(y_i, \hat{y}_i) = - \left[ y_i \log(\hat{y}_i) + (1 - y_i) \log(1 - \hat{y}_i) \right]$
        - Multiclass Log Loss for multiclass classification.
2. **Second Term: Regularization Term ($\Omega(h_k)$)**

    - Adds penalties for model complexity to avoid overfitting: $\Omega(h_k) = \gamma T + \frac{1}{2} \lambda \sum_j w_j^2$
    - $T$: Number of leaves in the tree.
    - $w_j$: Weights of the leaves.
    - $\gamma$: Penalizes additional leaves.
    - $\lambda$: Penalizes large leaf weights (L2 regularization).
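
As a small numeric sketch of this penalty (the leaf weights and penalty values are made up for illustration):

```python
# Omega(h_k) = gamma * T + 0.5 * lambda * sum_j w_j^2 for one hypothetical tree
leaf_weights = [0.5, -0.3, 0.2]   # w_j for a tree with T = 3 leaves (made-up values)
gamma_, lambda_ = 1.0, 1.0

omega = gamma_ * len(leaf_weights) + 0.5 * lambda_ * sum(w ** 2 for w in leaf_weights)
print(omega)  # 3.0 + 0.5 * (0.25 + 0.09 + 0.04) = 3.19
```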
---
### **Optimization in XGBoost**

XGBoost uses a **second-order Taylor approximation** to expand the loss function around the current prediction (the constant term $l(y_i, \hat{y}_i^{(t-1)})$ does not depend on the new tree and is dropped):

$$L(\Theta) \approx \sum_{i=1}^n \left[ g_i h_k(x_i) + \frac{1}{2} h_i h_k(x_i)^2 \right] + \Omega(h_k)$$

- **Gradient ($g_i$):** First derivative of the loss function with respect to predictions.
- **Hessian ($h_i$):** Second derivative of the loss function with respect to predictions.

This allows XGBoost to:

1. Use both gradient and curvature (Hessian) information for more precise optimization.
2. Efficiently determine splits and optimize leaf weights during tree construction.
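
Grouping this expansion by leaf and minimizing the resulting quadratic in each leaf weight makes the link to the split gain and leaf-weight formulas below explicit (a short derivation sketch, with $G_j = \sum_{i \in j} g_i$ and $H_j = \sum_{i \in j} h_i$):

$$\sum_{j=1}^{T} \left[ G_j w_j + \frac{1}{2}(H_j + \lambda) w_j^2 \right] + \gamma T
\quad\Rightarrow\quad
w_j^* = -\frac{G_j}{H_j + \lambda}, \qquad
\text{obj}^* = -\frac{1}{2}\sum_{j=1}^{T} \frac{G_j^2}{H_j + \lambda} + \gamma T$$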
---
### **Advantages of XGBoost Over Classic Gradient Boosting**

1. **Speed:** Parallel computation and optimized algorithms for faster training.
2. **Regularization:** L1 and L2 regularization reduce overfitting.
3. **Handling Missing Data:** Automatically manages missing values during training.
4. **Scalability:** Works efficiently with large datasets and sparse data.
5. **Customizability:** Allows custom loss functions and objective tuning.
6. **Pruning and Sparsity Awareness:** More efficient model structures.

XGBoost has become the go-to algorithm in many data science competitions and practical applications due to these advantages.

In **XGBoost**, the **gradient** and **Hessian** of the loss function are used to update the model efficiently by guiding the optimization process during tree construction. These values provide first- and second-order information about the behavior of the loss function, allowing for more precise updates.

Here’s a breakdown of how they are used:
---
### **1. Gradient and Hessian Computation**

For a given loss function $l(y, \hat{y})$, the **gradient** ($g$) and **Hessian** ($h$) are computed for each training example:

- **Gradient ($g_i$)**: Measures the direction and magnitude of the steepest ascent in the loss function with respect to the model's prediction:

$$g_i = \frac{\partial l(y_i, \hat{y}_i)}{\partial \hat{y}_i}$$

- **Hessian ($h_i$)**: Measures the curvature (second derivative) of the loss function with respect to the model's prediction:

$$h_i = \frac{\partial^2 l(y_i, \hat{y}_i)}{\partial \hat{y}_i^2}$$
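
A short NumPy sketch of these two quantities for the losses mentioned earlier. Note the assumption that, as is conventional in boosting implementations, the log-loss derivatives are taken with respect to the raw score (logit) rather than the probability itself:

```python
import numpy as np

def grad_hess_squared_error(y, y_hat):
    """l = (y - y_hat)^2  ->  g = 2 * (y_hat - y),  h = 2."""
    g = 2.0 * (y_hat - y)
    h = 2.0 * np.ones_like(y_hat)
    return g, h

def grad_hess_logloss(y, p):
    """Binary log loss, differentiated w.r.t. the raw score z with p = sigmoid(z):
    g = p - y,  h = p * (1 - p)."""
    g = p - y
    h = p * (1.0 - p)
    return g, h

y = np.array([1.0, 0.0, 1.0])
p = np.array([0.7, 0.2, 0.4])
print(grad_hess_logloss(y, p))  # g = [-0.3, 0.2, -0.6], h = [0.21, 0.16, 0.24]
```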
---
### **2. Tree Splitting Using Gradient and Hessian**

#### **Split Criterion**

XGBoost constructs decision trees by finding splits that minimize the loss function. At each split, the gain is calculated using the gradient and Hessian.

For a given split, the gain is computed as:

$$\text{Gain} = \frac{1}{2} \left[ \frac{G_L^2}{H_L + \lambda} + \frac{G_R^2}{H_R + \lambda} - \frac{(G_L + G_R)^2}{H_L + H_R + \lambda} \right] - \gamma$$

Where:

- $G_L$, $G_R$: Sum of gradients for the left and right child nodes.
- $H_L$, $H_R$: Sum of Hessians for the left and right child nodes.
- $\lambda$: L2 regularization parameter (smooths the model).
- $\gamma$: Minimum loss reduction required to make a split (controls tree complexity).

The algorithm selects the split that maximizes the gain.
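
A direct transcription of the gain formula into Python, using hypothetical gradient and Hessian sums (an illustration of the formula, not the library's implementation):

```python
def split_gain(G_L, H_L, G_R, H_R, lambda_=1.0, gamma_=0.0):
    """Gain of splitting a node into left/right children, given the summed
    gradients (G) and Hessians (H) on each side."""
    def score(G, H):
        return G ** 2 / (H + lambda_)
    return 0.5 * (score(G_L, H_L) + score(G_R, H_R) - score(G_L + G_R, H_L + H_R)) - gamma_

# Hypothetical sums over the examples falling in each child:
print(split_gain(G_L=-4.0, H_L=3.0, G_R=5.0, H_R=4.0, lambda_=1.0, gamma_=0.5))
# 0.5 * (4 + 5 - 0.125) - 0.5 = 3.9375
```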
---
### **3. Leaf Weight Optimization**

Once a tree structure is determined, the weight of each leaf is optimized using both the gradients and Hessians. The optimal weight $w_j$ for a leaf $j$ is calculated as:

$$w_j = -\frac{G_j}{H_j + \lambda}$$

Where:

- $G_j$: Sum of gradients for all examples in the leaf.
- $H_j$: Sum of Hessians for all examples in the leaf.
- $\lambda$: L2 regularization parameter.

This weight minimizes the loss for that leaf, balancing model complexity and predictive accuracy.
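
The corresponding leaf-weight formula, in the same illustrative style:

```python
def leaf_weight(G_j, H_j, lambda_=1.0):
    """Optimal weight w_j = -G_j / (H_j + lambda) for a leaf with summed
    gradient G_j and summed Hessian H_j."""
    return -G_j / (H_j + lambda_)

print(leaf_weight(G_j=-4.0, H_j=3.0))  # -(-4) / (3 + 1) = 1.0
```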
---
### **4. Model Update**

After computing the optimal splits and leaf weights, the predictions for the dataset are updated:

$$\hat{y}_i^{(t+1)} = \hat{y}_i^{(t)} + \eta \cdot w(x_i)$$

Where:

- $\hat{y}_i^{(t)}$: Prediction for sample $i$ at iteration $t$.
- $\eta$: Learning rate (controls step size).
- $w(x_i)$: Weight of the leaf to which $x_i$ belongs in the new tree.

This iterative process improves the model's predictions by reducing the residual errors at each step.
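
Putting the update together, here is a deliberately simplified sketch of one boosting round that uses a single-leaf "tree" and the squared-error gradients and Hessians from above; a real round would instead grow a full tree using the split gain described earlier:

```python
import numpy as np

def boosting_round(y, y_hat, eta=0.1, lambda_=1.0):
    """One simplified XGBoost-style update with a single-leaf tree:
    compute g and h (squared-error loss), fit the optimal leaf weight,
    then shrink it by the learning rate and add it to the predictions."""
    g = 2.0 * (y_hat - y)               # gradients
    h = 2.0 * np.ones_like(y_hat)       # Hessians
    w = -g.sum() / (h.sum() + lambda_)  # optimal weight of the (only) leaf
    return y_hat + eta * w              # y_hat^(t+1) = y_hat^(t) + eta * w(x_i)

y = np.array([3.0, 2.0, 4.0])
y_hat = np.zeros(3)
for _ in range(50):
    y_hat = boosting_round(y, y_hat)
print(y_hat)  # predictions move toward the mean of y as rounds accumulate
```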
---
### **Why Use Gradient and Hessian?**

1. **Gradient ($g$):** Indicates the direction and magnitude of adjustments needed to reduce the loss.
2. **Hessian ($h$):** Helps adjust for the curvature of the loss function, leading to more precise updates (second-order optimization).

By leveraging both, XGBoost:

- Makes more informed splits and weight calculations.
- Optimizes the model efficiently while avoiding overfitting.