-
Notifications
You must be signed in to change notification settings - Fork 505
Open
Description
Reproduction code and ORC data
orc-version: current main, commit-id 3251a01f56d3f813e4889eb7d75eb2d0d63551f6
Reproduction code:
#include <cmath>
#include <iostream>
#include <queue>
#include <list>
#include <orc/OrcFile.hh>
#include <orc/Reader.hh>
#include <orc/Type.hh>
#include <orc/Vector.hh>
#include <string>
#include <vector>
#define BATCH_SIZE 1024
// Print the 7's column data in curr_batch
void print_batch(const ORC_UNIQUE_PTR<orc::ColumnVectorBatch>& curr_batch,
const char* msg) {
std::cout << "debug check vector str " << msg << " curr_batch "
<< curr_batch.get() << std::endl;
auto curr_batch_data =
dynamic_cast<orc::StructVectorBatch*>(curr_batch.get());
auto string_batch =
dynamic_cast<orc::StringVectorBatch*>(curr_batch_data->fields[0]);
int64_t total = curr_batch->numElements;
for (int64_t idx = 0; idx < total; idx++) {
int64_t orc_data_len = string_batch->length[idx];
char* orc_data = (char*)string_batch->data[idx];
std::cout << "debug check vector str " << msg << " "
<< std::string(orc_data, orc_data_len) << std::endl;
}
}
int main(int argc, char* argv[]) {
if (argc < 2) {
std::cerr << " invalid args " << std::endl;
return 1;
}
int num_count = std::stoi(argv[1]);
try {
const std::string& filePath = "000000_0";
// reader
auto reader =
orc::createReader(orc::readFile(filePath), orc::ReaderOptions());
auto row_count = reader->getNumberOfRows();
// row_reader
orc::RowReaderOptions rowReaderOptions;
std::list<uint64_t> l;
l.push_back(6);
rowReaderOptions.include(l);
std::unique_ptr<orc::RowReader> rowReader =
reader->createRowReader(rowReaderOptions);
const orc::Type& schema = rowReader->getSelectedType();
// read column batch and save it into vector
std::vector<ORC_UNIQUE_PTR<orc::ColumnVectorBatch>> batch_pool;
std::unique_ptr<orc::ColumnVectorBatch> batch =
rowReader->createRowBatch(BATCH_SIZE);
batch_pool.reserve(row_count / BATCH_SIZE + 1);
int batch_count = 0;
while (rowReader->next(*batch) && batch_count < num_count) {
batch_pool.push_back(std::move(batch));
batch = rowReader->createRowBatch(BATCH_SIZE);
batch_count++;
}
// BUG access the first batch in vector, all the data is invalid
print_batch(batch_pool[0], "after loop");
} catch (const std::exception& e) {
std::cerr << "Error: " << e.what() << std::endl;
return 1;
}
return 0;
}The data is produced by hive with below schema:
CREATE TABLE IF NOT EXISTS customer_text (
c_custkey integer ,
c_name varchar(25) ,
c_address varchar(40) ,
c_nationkey integer ,
c_phone char(15) ,
c_acctbal decimal(15,2) ,
c_mktsegment char(10) ,
c_comment varchar(117)
) ROW FORMAT DELIMITED
FIELDS TERMINATED BY '|';
The data file used by the program is https://github.com/ZhangHuiGui/TEMP/000000_0; it is uploaded via git-lfs.
Execute program with attached orc-data:
./test_orc 440
It's weird that:
- When fewer than 440 batches are saved into the std::vector, the column-6 data of batch_pool[0] is valid; once 440 batches are saved, the column-6 data of batch_pool[0] becomes invalid.
- Only `c_mktsegment char(10)` has the problem...
$ ./orc_test 440|tail -n 10
debug check vector str after loop
debug check vector str after loop
debug check vector str after loop �n�@
debug check vector str after loop FURN!
debug check vector str after loop FURN!
debug check vector str after loop �n�@
debug check vector str after loop �n�@
debug check vector str after loop
debug check vector str after loop
debug check vector str after loop FURN!
./orc_test 339|tail -n 10
debug check vector str after loop HOUSEHOLD
debug check vector str after loop AUTOMOBILE
debug check vector str after loop MACHINERY
debug check vector str after loop FURNITURE
debug check vector str after loop FURNITURE
debug check vector str after loop MACHINERY
debug check vector str after loop MACHINERY
debug check vector str after loop AUTOMOBILE
debug check vector str after loop AUTOMOBILE
debug check vector str after loop FURNITURE
So I'm confused by this problem: am I using the API in the wrong way? Any suggestions?
Metadata
Metadata
Assignees
Labels
No labels