|
@@ -19,6 +19,7 @@ import org.elasticsearch.xpack.core.ml.dataframe.DataFrameAnalyticsConfig;
|
|
|
import org.elasticsearch.xpack.core.ml.inference.TrainedModelConfig;
|
|
|
import org.elasticsearch.xpack.core.ml.inference.TrainedModelDefinition;
|
|
|
import org.elasticsearch.xpack.core.ml.inference.TrainedModelInput;
|
|
|
+import org.elasticsearch.xpack.core.ml.utils.ExceptionsHelper;
|
|
|
import org.elasticsearch.xpack.ml.dataframe.DataFrameAnalyticsTask.ProgressTracker;
|
|
|
import org.elasticsearch.xpack.ml.dataframe.process.results.AnalyticsResult;
|
|
|
import org.elasticsearch.xpack.ml.dataframe.process.results.RowResults;
|
|
@@ -37,6 +38,18 @@ public class AnalyticsResultProcessor {
|
|
|
|
|
|
private static final Logger LOGGER = LogManager.getLogger(AnalyticsResultProcessor.class);
|
|
|
|
|
|
+ /**
|
|
|
+ * While we report progress as we read row results there are other things we need to account for
|
|
|
+ * to report completion. There are other types of results we can't predict the number of like
|
|
|
+ * progress objects and the inference model. Thus, we report a max progress until we know we have
|
|
|
+ * completed processing results.
|
|
|
+ *
|
|
|
+ * It is critical to ensure we do not report complete progress too soon as restarting a job
|
|
|
+ * uses the progress to determine which state to restart from. If we report full progress too soon
|
|
|
+ * we cannot restart a job as we will think the job was finished.
|
|
|
+ */
|
|
|
+ private static final int MAX_PROGRESS_BEFORE_COMPLETION = 98;
|
|
|
+
|
|
|
private final DataFrameAnalyticsConfig analytics;
|
|
|
private final DataFrameRowsJoiner dataFrameRowsJoiner;
|
|
|
private final ProgressTracker progressTracker;
|
|
@@ -68,7 +81,7 @@ public class AnalyticsResultProcessor {
|
|
|
completionLatch.await();
|
|
|
} catch (InterruptedException e) {
|
|
|
Thread.currentThread().interrupt();
|
|
|
- LOGGER.error(new ParameterizedMessage("[{}] Interrupted waiting for results processor to complete", analytics.getId()), e);
|
|
|
+ setAndReportFailure(ExceptionsHelper.serverError("interrupted waiting for results processor to complete", e));
|
|
|
}
|
|
|
}
|
|
|
|
|
@@ -91,27 +104,32 @@ public class AnalyticsResultProcessor {
|
|
|
processResult(result, resultsJoiner);
|
|
|
if (result.getRowResults() != null) {
|
|
|
processedRows++;
|
|
|
- progressTracker.writingResultsPercent.set(processedRows >= totalRows ? 100 : (int) (processedRows * 100.0 / totalRows));
|
|
|
+ updateResultsProgress(processedRows >= totalRows ? 100 : (int) (processedRows * 100.0 / totalRows));
|
|
|
}
|
|
|
}
|
|
|
- if (isCancelled == false) {
|
|
|
- // This means we completed successfully so we need to set the progress to 100.
|
|
|
- // This is because due to skipped rows, it is possible the processed rows will not reach the total rows.
|
|
|
- progressTracker.writingResultsPercent.set(100);
|
|
|
- }
|
|
|
} catch (Exception e) {
|
|
|
if (isCancelled) {
|
|
|
// No need to log error as it's due to stopping
|
|
|
} else {
|
|
|
- LOGGER.error(new ParameterizedMessage("[{}] Error parsing data frame analytics output", analytics.getId()), e);
|
|
|
- failure = "error parsing data frame analytics output: [" + e.getMessage() + "]";
|
|
|
+ setAndReportFailure(e);
|
|
|
}
|
|
|
} finally {
|
|
|
+ if (isCancelled == false && failure == null) {
|
|
|
+ completeResultsProgress();
|
|
|
+ }
|
|
|
completionLatch.countDown();
|
|
|
process.consumeAndCloseOutputStream();
|
|
|
}
|
|
|
}
|
|
|
|
|
|
+ private void updateResultsProgress(int progress) {
|
|
|
+ progressTracker.writingResultsPercent.set(Math.min(progress, MAX_PROGRESS_BEFORE_COMPLETION));
|
|
|
+ }
|
|
|
+
|
|
|
+ private void completeResultsProgress() {
|
|
|
+ progressTracker.writingResultsPercent.set(100);
|
|
|
+ }
|
|
|
+
|
|
|
private void processResult(AnalyticsResult result, DataFrameRowsJoiner resultsJoiner) {
|
|
|
RowResults rowResults = result.getRowResults();
|
|
|
if (rowResults != null) {
|
|
@@ -137,7 +155,7 @@ public class AnalyticsResultProcessor {
|
|
|
}
|
|
|
} catch (InterruptedException e) {
|
|
|
Thread.currentThread().interrupt();
|
|
|
- LOGGER.error(new ParameterizedMessage("[{}] Interrupted waiting for inference model to be stored", analytics.getId()), e);
|
|
|
+ setAndReportFailure(ExceptionsHelper.serverError("interrupted waiting for inference model to be stored"));
|
|
|
}
|
|
|
}
|
|
|
|
|
@@ -168,19 +186,22 @@ public class AnalyticsResultProcessor {
|
|
|
aBoolean -> {
|
|
|
if (aBoolean == false) {
|
|
|
LOGGER.error("[{}] Storing trained model responded false", analytics.getId());
|
|
|
+ setAndReportFailure(ExceptionsHelper.serverError("storing trained model responded false"));
|
|
|
} else {
|
|
|
LOGGER.info("[{}] Stored trained model with id [{}]", analytics.getId(), trainedModelConfig.getModelId());
|
|
|
auditor.info(analytics.getId(), "Stored trained model with id [" + trainedModelConfig.getModelId() + "]");
|
|
|
}
|
|
|
},
|
|
|
- e -> {
|
|
|
- LOGGER.error(new ParameterizedMessage("[{}] Error storing trained model [{}]", analytics.getId(),
|
|
|
- trainedModelConfig.getModelId()), e);
|
|
|
- auditor.error(analytics.getId(), "Error storing trained model with id [" + trainedModelConfig.getModelId()
|
|
|
- + "]; error message [" + e.getMessage() + "]");
|
|
|
- }
|
|
|
+ e -> setAndReportFailure(ExceptionsHelper.serverError("error storing trained model with id [{}]", e,
|
|
|
+ trainedModelConfig.getModelId()))
|
|
|
);
|
|
|
trainedModelProvider.storeTrainedModel(trainedModelConfig, new LatchedActionListener<>(storeListener, latch));
|
|
|
return latch;
|
|
|
}
|
|
|
+
|
|
|
+ private void setAndReportFailure(Exception e) {
|
|
|
+ LOGGER.error(new ParameterizedMessage("[{}] Error processing results; ", analytics.getId()), e);
|
|
|
+ failure = "error processing results; " + e.getMessage();
|
|
|
+ auditor.error(analytics.getId(), "Error processing results; " + e.getMessage());
|
|
|
+ }
|
|
|
}
|