Question

(I asked multiple questions to the implementation of the huffman code here, if I ask too many questions please tell me and I ll stop. Any tips on how to learn to find those mistakes myself are appreciated, I spent hours and couldn t find the problem.)
I try to implement huffman code in C. Right now I can successfully read a file, generate the tree and write the compressed data to the output file. However if my input file is random generated (using something like dd if=/dev/random of=foo bs=100 count=1) the program aborts with a segfault. The weird thing is, this only happens on some random files, some others work. I m confident that the null character isn t the problem because I don t use any str*() functions (I know printTree() does rely on but I don t use the function), also one random file that I generated contained an empty byte and it worked. One more time I am profoundly confused as to why my code doesn t work, more specific why it doesn t treat all input equally (I would have expected that my code either fails all the time or never as long as the input has the same length).

huffman. c :

#include "huffman.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct node {
    node* left;
    node* right;
    double weight;
    char* bytes;
    size_t nmemb;
};

void printTree(struct node* root, int depth)
{
    if (root == NULL) return;

    for (int i = 0; i < depth; i++) {
        printf(" │ ");
    }
    printf("(%s: %.2f)
", root->bytes, root->weight);

    printTree(root->left, depth + 1);
    printTree(root->right, depth + 1);
}

// Order nodes in array, smallest weight last
static int compareFrequency(const void* a, const void* b)
{
    if ((*(node**)a)->weight < (*(node**)b)->weight) return 1;
    if ((*(node**)a)->weight > (*(node**)b)->weight) return -1;
    if ((*(node**)a)->nmemb < (*(node**)b)->nmemb) return 1;
    if ((*(node**)a)->nmemb > (*(node**)b)->nmemb) return -1;
    return 1;
}

// Encode input to output
void encode(FILE* input, FILE* output)
{
    unsigned int possibleBytes[256] = { 0 }, differentBytes = 0, i;
    unsigned long long totalBytes = 0;
    char currChar;

    // Count number of occurence of each individual byte
    while ((currChar = fgetc(input)) != EOF) {
        possibleBytes[(int)currChar]++;
        totalBytes++;
    }

    // Count number of unique bytes
    for (i = 0; i < 256; i++) {
        if (possibleBytes[i]) differentBytes++;
    }

    // Make array filled with nodes (at the moment only the used bytes)
    node* nodes = calloc(differentBytes * 2 - 1, sizeof(node));
    node* nodesP = nodes;

    for (i = 0; i < 256; i++) {
        if (possibleBytes[i]) {
            nodesP->bytes = calloc(1, sizeof(char));
            nodesP->bytes[0] = i;
            nodesP->weight = (double)possibleBytes[i] / totalBytes;
            nodesP->nmemb = 1;
            nodesP++;
        }
    }

    // Fill trees array with nodes
    node** trees = calloc(differentBytes, sizeof(node*));
    node** treesP = trees;
    nodesP = nodes;
    unsigned int treesLeft = differentBytes;

    for (i = 0; i < treesLeft; i++) {
        *treesP = nodesP;
        treesP++;
        nodesP++;
    }

    // Build tree
    node* lastTree;
    node* secondLastTree;

    while (treesLeft > 1) {
        qsort(trees, treesLeft, sizeof(node*), &compareFrequency);

        lastTree = *(trees + treesLeft - 1);
        secondLastTree = *(trees + treesLeft - 2);

        nodesP->left = lastTree;
        nodesP->right = secondLastTree;
        nodesP->weight = lastTree->weight + secondLastTree->weight;
        nodesP->nmemb = lastTree->nmemb + secondLastTree->nmemb;
        nodesP->bytes = calloc(lastTree->nmemb + secondLastTree->nmemb, sizeof(char));
        // str* functions use  terminated string -> Might not work if array contains  (not at
        // the end)
        // strcat(nodesP->bytes, lastTree->bytes);
        // strcat(nodesP->bytes, secondLastTree->bytes);
        memcpy(nodesP->bytes, lastTree->bytes, lastTree->nmemb);
        memcpy(nodesP->bytes + lastTree->nmemb, secondLastTree->bytes, secondLastTree->nmemb);
        trees[treesLeft - 2] = nodesP;

        treesLeft--;
        nodesP++;
    }

    // write tree to file
    // https://stackoverflow.com/a/759766/15833045
    //...

    // write data to file
    // left is 0, right is 1
    rewind(input);
    unsigned char bitPosition = 1;
    unsigned char buffer[BUFSIZ];
    node* currNode = *trees;
    currChar = fgetc(input);

    for (i = 0; currChar != EOF; i++) {
        // Fill byte of buffer
        while (bitPosition <= 8) {
            // Set 0 if going left, set 1 if going right
            if (memchr(currNode->left->bytes, currChar, currNode->nmemb)) {
                buffer[i] &= ~(1 << (8 - bitPosition));
                currNode = currNode->left;
            } else {
                buffer[i] |= (1 << (8 - bitPosition));
                currNode = currNode->right;
            }

            // If leaf, reset currNode and get next byte
            if (!currNode->left) {
                currNode = *trees;
                currChar = fgetc(input);
                if (currChar == EOF) {
                    break;
                }
            }
            bitPosition++;
        }
        bitPosition = 1;

        if (i == BUFSIZ - 1) {
            i = 0;
            fwrite(buffer, sizeof(char), BUFSIZ, output);
        }
    }

    fwrite(buffer, sizeof(char), i, output);
}

主要.c:

#include "huffman.h"
#include <stdio.h>

int main(int argc, char* argv[])
{
    FILE* inputP = fopen(argv[1], "rb");
    FILE* outputP = fopen(argv[2], "w");

    if (!(inputP && outputP)) return 1;

    encode(inputP, outputP);

    fclose(inputP);
    fclose(outputP);
    return 0;
}

制作文件 :

CC = gcc
CFLAGS = -g
OBJ = main.o huffman.o

huffman: $(OBJ)
    $(CC) $(OBJ) -o $@

%.o: %.c
    $(CC) $(CFLAGS) -c $<


clean:
    rm *.o

gdb 输出( 当使用不同的随机文件并因此使用不同的核心倾弃处时略有不同) :

Reading symbols from huffman...
[New LWP 9843]
[Thread debugging using libthread_db enabled]
Using host libthread_db library "/usr/lib/libthread_db.so.1".
Core was generated by `huffman a b .
Program terminated with signal SIGSEGV, Segmentation fault.
#0  __memchr_avx2 () at ../sysdeps/x86_64/multiarch/memchr-avx2.S:82
82              VPCMPEQ (%rdi), %ymm0, %ymm1

Answer 1

currChar 必须是 int 。这将将读取的字符设置为 0. 255 而不是 - 128.1. 127。然后负值, 其中至少有一个会以100个随机字节中找到 < supp/ sup>, 将导致 < code> 可能的Bytes[] 的出入境访问。

您的 memchr( curr node- & gt; left- gt; left- gt; bytes, currChar, currNode- gt; nmemb) 正在使用 currnode 上的字节数, 这比 currNode- gt; left 上的字节数要多。最后一个参数必须是 currnode- gt; currnode- gt; lft- gt; nmemb 。

一些评论:

You need a typedef struct node node; at the start for it to compile at all.
All of the casts in compareFrequency() need to be node * const * to correspond to the const void * arguments.
You need to return 0; at the end of compareFrequency(), for it to do what it claims.
You should use integers, in particular long long s, for the weights. Not double s. You don t need probabilities, just frequencies. Don t divide by the total.
Writing the tree to the file is easy. Traverse the tree recursively, and for every branch write a 0 bit, and for every leaf write 1 bit followed by the eight bits of the character at that leaf.
Instead of keeping lists of all of the bytes at each branch for searching when encoding, you should simply make a table of Huffman codes by traversing the tree once. (Which can be the same time you traverse the tree to write it out.) Then just look up the character in the table for encoding.
Your i = 0; at the end of the buffer is followed by the i++ of the for loop, so you start the next round at i equal to one, instead of the intended zero.
You need to think about what to do if differentBytes is one. What is the length of the code? You will also need to think about how to handle the last few bits in the last byte, and keep them from decoding into an extraneous character. There is exists one solution to both of those problems.

*:概率1-2 ^-100。

友情链接