package org.apache.hudi.utilities.sources.helpers;

import java.io.Serializable;
import java.util.Iterator;
import java.util.List;
import org.apache.hudi.common.config.HoodieStorageConfig;
import org.apache.hudi.common.config.TypedProperties;
import org.apache.hudi.common.util.ConfigUtils;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.common.util.StringUtils;
import org.apache.hudi.common.util.collection.Pair;
import org.apache.hudi.utilities.config.CloudSourceConfig;
import org.apache.hudi.utilities.config.HoodieIncrSourceConfig;
import org.apache.hudi.utilities.ingestion.HoodieIngestionMetrics;
import org.apache.hudi.utilities.schema.SchemaProvider;
import org.apache.hudi.utilities.sources.helpers.CloudObjectsSelectorCommon;
import org.apache.hudi.utilities.streamer.SourceProfileSupplier;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/* loaded from: input_file:org/apache/hudi/utilities/sources/helpers/CloudDataFetcher.class */
/**
 * Reads the data files referenced by a Dataset of cloud-object metadata rows and returns
 * their contents as a Spark Dataset together with the checkpoint string to resume from.
 *
 * <p>The instance is {@link Serializable}, but the properties, Spark handles and the
 * selector are {@code transient}: they are not carried across serialization and will be
 * {@code null} on a deserialized copy.
 */
public class CloudDataFetcher implements Serializable {
    private static final long serialVersionUID = 1;
    private static final Logger LOG = LoggerFactory.getLogger(CloudDataFetcher.class);
    private static final String EMPTY_STRING = "";

    /**
     * Multiplier applied to the summed object sizes before dividing by the bytes-per-partition
     * target. NOTE(review): presumably head-room for per-record overhead added on ingest - confirm.
     */
    private static final double SIZE_INFLATION_FACTOR = 1.1d;

    private transient TypedProperties props;
    private transient JavaSparkContext sparkContext;
    private transient SparkSession sparkSession;
    private transient CloudObjectsSelectorCommon cloudObjectsSelectorCommon;
    private final HoodieIngestionMetrics metrics;

    /**
     * Creates a fetcher that builds its own {@link CloudObjectsSelectorCommon} from the given properties.
     *
     * @param typedProperties        source configuration
     * @param javaSparkContext       Spark context used when listing object metadata
     * @param sparkSession           Spark session used when loading the data files
     * @param hoodieIngestionMetrics sink for the bytes-to-ingest and parallelism metrics
     */
    public CloudDataFetcher(TypedProperties typedProperties, JavaSparkContext javaSparkContext, SparkSession sparkSession, HoodieIngestionMetrics hoodieIngestionMetrics) {
        this(typedProperties, javaSparkContext, sparkSession, hoodieIngestionMetrics, new CloudObjectsSelectorCommon(typedProperties));
    }

    /**
     * Creates a fetcher with an explicitly supplied selector.
     *
     * @param typedProperties            source configuration
     * @param javaSparkContext           Spark context used when listing object metadata
     * @param sparkSession               Spark session used when loading the data files
     * @param hoodieIngestionMetrics     sink for the bytes-to-ingest and parallelism metrics
     * @param cloudObjectsSelectorCommon selector that performs the actual dataset load
     */
    public CloudDataFetcher(TypedProperties typedProperties, JavaSparkContext javaSparkContext, SparkSession sparkSession, HoodieIngestionMetrics hoodieIngestionMetrics, CloudObjectsSelectorCommon cloudObjectsSelectorCommon) {
        this.props = typedProperties;
        this.sparkContext = javaSparkContext;
        this.sparkSession = sparkSession;
        this.metrics = hoodieIngestionMetrics;
        this.cloudObjectsSelectorCommon = cloudObjectsSelectorCommon;
    }

    /**
     * Resolves the data file format from the properties.
     *
     * <p>{@link CloudSourceConfig#DATAFILE_FORMAT} wins when it is set to a non-empty value;
     * otherwise {@link HoodieIncrSourceConfig#SOURCE_FILE_FORMAT} is read.
     *
     * @param typedProperties properties to read the format from
     * @return the configured file format
     */
    public static String getFileFormat(TypedProperties typedProperties) {
        // Look the primary key up once instead of once for the check and once for the result.
        String dataFileFormat = ConfigUtils.getStringWithAltKeys(typedProperties, CloudSourceConfig.DATAFILE_FORMAT, EMPTY_STRING);
        if (StringUtils.isNullOrEmpty(dataFileFormat)) {
            return ConfigUtils.getStringWithAltKeys(typedProperties, HoodieIncrSourceConfig.SOURCE_FILE_FORMAT, true);
        }
        return dataFileFormat;
    }

    /**
     * Filters the incoming metadata rows, trims them to the source limit, and loads the
     * referenced objects as a Dataset.
     *
     * <p>When a source profile is available, its max source bytes replaces {@code sourceLimit},
     * and a positive source-specific context replaces the configured bytes-per-partition.
     *
     * @param cloudType                 kind of cloud store, used to build the row filter and read object metadata
     * @param cloudObjectIncrCheckpoint checkpoint to resume from
     * @param sourceProfileSupplier     optional supplier of a source profile overriding the limits
     * @param queryInfoDatasetPair      query info (left) and the metadata rows to read from (right)
     * @param schemaProvider            optional schema provider passed through to the loader
     * @param sourceLimit               maximum number of bytes to take in this round
     * @return the loaded data (empty when nothing is left after limiting) and the end checkpoint as a string
     */
    public Pair<Option<Dataset<Row>>, String> fetchPartitionedSource(CloudObjectsSelectorCommon.Type cloudType, CloudObjectIncrCheckpoint cloudObjectIncrCheckpoint, Option<SourceProfileSupplier> sourceProfileSupplier, Pair<QueryInfo, Dataset<Row>> queryInfoDatasetPair, Option<SchemaProvider> schemaProvider, long sourceLimit) {
        boolean hasSourceProfile = sourceProfileSupplier.isPresent() && sourceProfileSupplier.get().getSourceProfile() != null;
        if (hasSourceProfile) {
            long sourceLimitFromProfile = sourceProfileSupplier.get().getSourceProfile().getMaxSourceBytes();
            LOG.debug("Using source limit from source profile sourceLimitFromConfig {} sourceLimitFromProfile {}", sourceLimit, sourceLimitFromProfile);
            sourceLimit = sourceLimitFromProfile;
        }

        QueryInfo queryInfo = queryInfoDatasetPair.getLeft();
        String filterExpression = CloudObjectsSelectorCommon.generateFilter(cloudType, this.props);
        LOG.info("Adding filter string to Dataset: {}", filterExpression);
        Dataset<Row> filteredRows = queryInfoDatasetPair.getRight().filter(filterExpression);

        LOG.info("Adjusting end checkpoint:{} based on sourceLimit :{}", queryInfo.getEndInstant(), sourceLimit);
        Pair<CloudObjectIncrCheckpoint, Option<Dataset<Row>>> checkpointAndRows = IncrSourceHelper.filterAndGenerateCheckpointBasedOnSourceLimit(filteredRows, sourceLimit, queryInfo, cloudObjectIncrCheckpoint);
        CloudObjectIncrCheckpoint endCheckpoint = checkpointAndRows.getLeft();
        if (!checkpointAndRows.getRight().isPresent()) {
            LOG.info("Empty source, returning endpoint:{}", endCheckpoint);
            return Pair.of(Option.empty(), endCheckpoint.toString());
        }
        LOG.info("Adjusted end checkpoint :{}", endCheckpoint);

        List<CloudObjectMetadata> objectMetadata = CloudObjectsSelectorCommon.getObjectMetadata(cloudType, this.sparkContext, checkpointAndRows.getRight().get(), ConfigUtils.getBooleanWithAltKeys(this.props, CloudSourceConfig.ENABLE_EXISTS_CHECK), this.props);
        LOG.info("Total number of files to process :{}", objectMetadata.size());

        long bytesPerPartition = resolveBytesPerPartition(sourceProfileSupplier, hasSourceProfile);
        return Pair.of(getCloudObjectDataDF(objectMetadata, schemaProvider, bytesPerPartition), endCheckpoint.toString());
    }

    /**
     * Picks the bytes-per-partition target: the configured value, overridden by the source
     * profile's source-specific context when that is present and positive.
     */
    private long resolveBytesPerPartition(Option<SourceProfileSupplier> sourceProfileSupplier, boolean hasSourceProfile) {
        long bytesPerPartition;
        if (this.props.containsKey(CloudSourceConfig.SOURCE_MAX_BYTES_PER_PARTITION.key())) {
            bytesPerPartition = this.props.getLong(CloudSourceConfig.SOURCE_MAX_BYTES_PER_PARTITION.key());
        } else {
            bytesPerPartition = this.props.getLong(HoodieStorageConfig.PARQUET_MAX_FILE_SIZE.key(), Long.parseLong(HoodieStorageConfig.PARQUET_MAX_FILE_SIZE.defaultValue()));
        }
        if (hasSourceProfile) {
            // Keep the boxed value: a profile without a source-specific context must not
            // fail with a NullPointerException on unboxing; it simply does not override.
            Long bytesPerPartitionFromProfile = (Long) sourceProfileSupplier.get().getSourceProfile().getSourceSpecificContext();
            if (bytesPerPartitionFromProfile != null && bytesPerPartitionFromProfile > 0) {
                LOG.debug("Using bytesPerPartition from source profile bytesPerPartitionFromConfig {} bytesPerPartitionFromProfile {}", bytesPerPartition, bytesPerPartitionFromProfile);
                bytesPerPartition = bytesPerPartitionFromProfile;
            }
        }
        return bytesPerPartition;
    }

    /**
     * Reports the size and parallelism metrics for this round and delegates the load to the selector.
     *
     * @param cloudObjectMetadata objects to load
     * @param schemaProvider      optional schema provider passed through to the loader
     * @param bytesPerPartition   target number of bytes per Spark partition
     * @return the loaded data as returned by the selector
     */
    private Option<Dataset<Row>> getCloudObjectDataDF(List<CloudObjectMetadata> cloudObjectMetadata, Option<SchemaProvider> schemaProvider, long bytesPerPartition) {
        long totalSize = 0;
        for (CloudObjectMetadata metadata : cloudObjectMetadata) {
            totalSize += metadata.getSize();
        }
        this.metrics.updateStreamerSourceBytesToBeIngestedInSyncRound(totalSize);

        int numPartitions = computeNumPartitions(totalSize, bytesPerPartition);
        this.metrics.updateStreamerSourceParallelism(numPartitions);
        return this.cloudObjectsSelectorCommon.loadAsDataset(this.sparkSession, cloudObjectMetadata, getFileFormat(this.props), schemaProvider, numPartitions);
    }

    /**
     * Derives the partition count from the inflated total size, never returning less than one.
     *
     * <p>A non-positive {@code bytesPerPartition} yields one partition. Dividing by zero would
     * otherwise give Infinity (cast to {@code Integer.MAX_VALUE}) or, for an empty input, NaN
     * (cast to {@code 0}, because {@code Math.max} propagates NaN).
     */
    private static int computeNumPartitions(long totalSize, long bytesPerPartition) {
        if (bytesPerPartition <= 0) {
            LOG.warn("Non-positive bytesPerPartition {}, falling back to a single partition", bytesPerPartition);
            return 1;
        }
        return (int) Math.max(Math.ceil((totalSize * SIZE_INFLATION_FACTOR) / bytesPerPartition), 1.0d);
    }
}
