/*
 * podzial.c - fast and simple interface to morfologik
 * Copyright (C) Bohdan R. Rau 2012-2014 <ethanak@polip.com>
 * 
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 * 
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 * 
 * You should have received a copy of the GNU Lesser General Public
 * License along with this program.  If not, write to:
 * 	The Free Software Foundation, Inc.,
 * 	51 Franklin Street, Fifth Floor
 * 	Boston, MA  02110-1301, USA.
 */

#include <ctype.h>
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <unistd.h>

#include <glib.h>

/*
 * The word-type code occupies a 5-bit field that starts at bit WT_SHIFT
 * of a 64-bit grama value.
 */
#define WT_SHIFT 58
#define WT_MASK (31LL << WT_SHIFT)

/* Extract the 5-bit word-type code from grama value a. */
#define WT_GET(a) (((a) >> WT_SHIFT) & 31)
/* Evaluate to a with its word-type field replaced by t (a is not modified). */
#define WT_SET(a,t) (((a) & ~(31LL << WT_SHIFT)) | (((u_int64_t)(t)) << WT_SHIFT))

/*
 * Word-type codes stored in the WT field. Each code equals the index of the
 * matching tag in wt_markers[] plus one (parse_grama stores i+1).
 */
#define WT_adj  1
#define WT_adjp  2
#define WT_adv  3
#define WT_conj 4
#define WT_num  5
#define WT_pact 6
#define WT_pant 7
#define WT_pcon 8
#define WT_ppas 9
#define WT_ppron12 10
#define WT_ppron3 11
#define WT_pred 12
#define WT_adjc 13
#define WT_siebie 14
#define WT_subst 15
#define WT_verb 16
#define WT_brev 17
#define WT_interj 18
#define WT_xxx 19
#define WT_nie 20
#define WT_advp 21
#define WT_prep 22
#define WT_comp 23

/*
 * Tag names of the word types, NULL-terminated. The order must stay in sync
 * with the WT_* codes above (entry i corresponds to code i+1).
 */
char *wt_markers[]={
    "adj",    "adjp",    "adv",    "conj",
    "num",    "pact",    "pant",    "pcon",
    "ppas",    "ppron12",    "ppron3",    "pred",
    "adjc",    "siebie",    "subst",    "verb",
    "brev",    "interj",    "xxx",    "nie",
    "advp",	"prep",	"comp",
    NULL
};



/*
 * Grammatical marker bits of a grama value (bits 0..54). The *_MASK macros
 * cover the contiguous bit ranges of the groups defined right above them.
 */

/* bits 0-1 */
#define WM_sg   (1LL<<0)
#define WM_pl   (1LL<<1)

#define WM_NUM_MASK (3LL)

#define WM_pred (1LL<<2)

/* bits 3-9 */
#define WM_nom (1LL<<3)
#define WM_gen (1LL<<4)
#define WM_dat (1LL<<5)
#define WM_acc (1LL<<6)
#define WM_inst (1LL<<7)
#define WM_loc (1LL<<8)
#define WM_voc (1LL<<9)

#define WM_CASU_MASK (127LL<<3)

/* bits 10-12 */
#define WM_pos (1LL<<10)
#define WM_comp (1LL<<11)
#define WM_sup (1LL<<12)

#define WM_GRAD_MASK (7LL << 10)

/* bits 13-21; the combined tags "m", "n" and "p" are expressed in wm_mark[]
   as unions of the individual bits, so they have no bit of their own */
#define WM_m1 (1LL<<13)
#define WM_m2 (1LL<<14)
#define WM_m3 (1LL<<15)
//#define WM_m (1LL<<16)
#define WM_f (1LL<<16)
#define WM_n1 (1LL<<17)
#define WM_n2 (1LL<<18)
//#define WM_n (1LL<<20)
#define WM_p1 (1LL<<19)
#define WM_p2 (1LL<<20)
#define WM_p3 (1LL<<21)
//#define WM_p (1LL<<24)

#define WM_GENR_MASK (511LL << 13)

/* bits 22-24 */
#define WM_pri (1LL<<22)
#define WM_sec (1LL<<23)
#define WM_tri (1LL<<24)

#define WM_PERS_MASK (7LL << 22)

/* remaining single-bit markers */
#define WM_aff (1LL<<25)
#define WM_neg (1LL<<26)
#define WM_perf (1LL<<27)
#define WM_imperf (1LL<<28)
#define WM_nakc (1LL<<29)
#define WM_akc (1LL<<30)
#define WM_praep (1LL<<31)
#define WM_npraep (1LL<<32)
#define WM_imps (1LL<<33)
#define WM_impt (1LL<<34)
#define WM_inf (1LL<<35)
#define WM_fin (1LL<<36)
#define WM_praet (1LL<<37)
#define WM_pot (1LL<<38)
#define WM_nstd (1LL<<39)
#define WM_pun (1LL<<40)
#define WM_npun (1LL<<41)
#define WM_rec (1LL<<42)
#define WM_congr (1LL<<43)

#define WM_winien (1LL<<44)
#define WM_bedzie (1LL<<45)

#define WM_refl (1LL<<46)
#define WM_nonrefl  (1LL<<47)
#define WM_depr (1LL<<48)
#define WM_vulgar (1LL<<49)
#define WM_illegal (1LL<<50)
#define WM_ger (1LL << 51)
#define WM_wok (1LL << 52)
#define WM_nwok (1LL << 53)
#define WM_cplx (1LL << 54)
/* str3/str4 do not have bits of their own: they reuse the akc and wok bits */
#define WM_str3 WM_akc
#define WM_str4 WM_wok

/*
 * Maps marker tag names to the WM_* bits they set; terminated by a NULL
 * name. Looked up linearly by parse_grama(). Some names are aliases for the
 * same bit ("comp"/"com", "tri"/"ter") and "m", "n", "p" set several bits.
 */
struct {
    char *name;
    u_int64_t bits;
} wm_mark[]=
{
    {"sg", WM_sg},
    {"pl", WM_pl},
    {"pred", WM_pred},
    {"nom", WM_nom},
    {"gen", WM_gen},
    {"acc", WM_acc},
    {"dat", WM_dat},
    {"inst", WM_inst},
    {"loc", WM_loc},
    {"voc", WM_voc},
    {"pos", WM_pos},
    {"comp", WM_comp},
    {"com", WM_comp},
    {"sup", WM_sup},
    {"m1", WM_m1},
    {"m2", WM_m2},
    {"m3", WM_m3},
    {"m", WM_m1 | WM_m2 | WM_m3},
    {"n1", WM_n1},
    {"n2", WM_n2},
    {"n", WM_n1 | WM_n2},
    {"f", WM_f},
    {"p1", WM_p1},
    {"p2", WM_p2},
    {"p3", WM_p3},
    {"p", WM_p1 | WM_p2 | WM_p3},
    {"pri", WM_pri},
    {"sec", WM_sec},
    {"tri", WM_tri},
    {"ter", WM_tri},
    {"aff", WM_aff},
    {"neg", WM_neg},
    {"refl", WM_refl},
    {"perf", WM_perf},
    {"imperf", WM_imperf},
    {"nakc", WM_nakc},
    {"akc", WM_akc},
    {"praep", WM_praep},
    {"npraep", WM_npraep},
    {"ger", WM_ger},
    {"imps", WM_imps},
    {"impt", WM_impt},
    {"inf", WM_inf},
    {"fin", WM_fin},
    {"bedzie", WM_bedzie},
    {"praet", WM_praet},
    {"pot", WM_pot},
    {"nstd", WM_nstd},
    {"pun", WM_pun},
    {"npun", WM_npun},
    {"rec", WM_rec},
    {"congr", WM_congr},
    {"winien", WM_winien},
    {"depr", WM_depr},
    {"vulgar", WM_vulgar},
    {"ill",WM_illegal},
    {"nonrefl",WM_nonrefl},
    {"wok",WM_wok},
    {"nwok",WM_nwok},
    {"cplx",WM_cplx},
    {"str3",WM_str3},
    {"str4",WM_str4},
    {NULL,0}
};


/*
 * Parse a single grammar tag such as "subst:sg:nom:m1" into a grama value.
 *
 * The part before the first ':' is the word type; "depr" and "ger" are
 * mapped to "subst" with the WM_depr / WM_ger bit set. The remaining parts,
 * separated by ':' or '.', are looked up in wm_mark[] and OR-ed in; unknown
 * markers are reported on stderr and skipped.
 *
 * c is modified in place: separators are overwritten with NUL.
 * Returns the grama value, or 0 when the word type is not in wt_markers[].
 */
u_int64_t parse_grama(char *c)
{
    char *gm=strchr(c,':'),*d;
    int i;
    u_int64_t grama=0;

    if (gm) *gm++=0;
    if (!strcmp(c,"depr")) {
        c="subst";grama |= WM_depr;
    }
    else if (!strcmp(c,"ger")) {
        c="subst";grama |= WM_ger;
    }

    for (i=0;wt_markers[i];i++) {
        if (!strcmp(wt_markers[i],c)) break;
    }
    /* the search ran off the end of the table: unknown word type */
    if (!wt_markers[i]) {
        fprintf(stderr,"Unknown grama %s\n",c);
        return 0;
    }
    /* type codes are 1-based table indexes */
    grama = WT_SET(grama,i+1);
    for (;gm && *gm;gm=d) {
        d=strpbrk(gm,":.");
        if (d) *d++=0;
        for (i=0;wm_mark[i].name;i++) {
            if (!strcmp(gm,wm_mark[i].name)) break;
        }
        if (!wm_mark[i].name) {
            fprintf(stderr,"Bad grama [%s]\n",gm);
        }
        else {
            grama  |= wm_mark[i].bits;
        }
    }
    return grama;
}

/*
 There are two nstd forms: "domie" and "se".
 "ni" is an error: drop its subst reading and insert "nie" instead.

 qub, burk - not taken into account

*/

/* Not used by halloc(), which keeps its own private arena pointer. */
char *objmem;

#define HALLOC_SIZE (32 * 1024 * 1024)
/*
 * Simple never-freed arena allocator.
 *
 * size      - number of bytes requested; values <= 0 yield NULL.
 * forstring - non-zero: allocate exactly size bytes, carved from the top of
 *             the current arena downwards; zero: round size up to a multiple
 *             of 16 and carve it from the bottom of the arena upwards.
 *
 * Requests larger than 1024 bytes (after rounding) go straight to malloc().
 * When the two ends of the arena would meet, a fresh HALLOC_SIZE arena is
 * started (the old one stays allocated because objects in it are still in
 * use). Running out of memory terminates the program.
 */
void *halloc(int size,int forstring)
{
    static char *arena=NULL;
    static int obj_size=0,str_end=0;
    char *rc;

    if (size <= 0) return NULL;
    if (!forstring) {
        /* round up to 16 without truncating large sizes */
        size=(size + 15) & ~15;
    }
    if (size > 1024) {
        rc=malloc(size);
        if (!rc) {
            perror("halloc");
            exit(1);
        }
        return (void *)rc;
    }
    if (obj_size + size > str_end) {
        obj_size=0;
        str_end=HALLOC_SIZE;
        arena=malloc(HALLOC_SIZE);
        if (!arena) {
            perror("halloc");
            exit(1);
        }
    }

    if (forstring) {
        str_end -= size;
        return (void *)(arena + str_end);
    }
    rc=arena+obj_size;
    obj_size += size;
    return (void *)rc;
}

/* Return a copy of the NUL-terminated string s placed in halloc() string space. */
char *hdup(char *s)
{
    char *copy;

    copy=halloc(strlen(s)+1,1);
    return strcpy(copy,s);
}

int tlw(char *c,char *out)
{
    int i,znak;
    for (i=0;*c;i++) {
        znak=g_utf8_get_char(c);
        if (!znak) break;
        if (i >= 31) return -1;
        c=g_utf8_next_char(c);
        znak=g_unichar_tolower(znak);
        int n=g_unichar_to_utf8(znak,out);
        out+=n;
    }
    *out=0;
    return 1;
}


/*
Reading the table:
a) read the whole input
b) shuffle the lines
c) move all refl entries, one by one, to the end

*/

/* Buffer holding the concatenated input files (filled in main) and its size. */
char *morf_body;
int morf_size;
/* One accepted input line; the three fields point into morf_body. */
struct linein {
    char *word;   /* first tab-separated column */
    char *base;   /* second tab-separated column */
    char *grama;  /* third tab-separated column */
} *lines;


/* Number of entries in lines (main first uses it to count raw newlines). */
int lineno;


/*
 * Decide whether a line's grammar column contains at least one usable tag.
 *
 * grama is a '+'-separated list of tags; only the part of each tag before
 * the first ':' is examined. Returns 1 as soon as a tag is "refl", "ger",
 * "depr" or one of wt_markers[], otherwise 0. While word is "ni" and the
 * first tag of the list is "subst", the wt_markers[] lookup is skipped.
 * Exits the program when grama does not fit the work buffer.
 */
int good_grama(char *grama,char *word)
{
    char work[8192];
    char *tag,*rest,*colon;
    int k;

    if (strlen(grama) >= sizeof work) {
        fprintf(stderr,"Grama too long: %s\n",grama);
        exit(1);
    }
    strcpy(work,grama);

    tag=work;
    while (tag && *tag) {
        /* cut the current tag out of the list and strip its markers */
        rest=strchr(tag,'+');
        if (rest) *rest++=0;
        colon=strchr(tag,':');
        if (colon) *colon=0;

        if (strcmp(tag,"refl") == 0 || strcmp(tag,"ger") == 0 || strcmp(tag,"depr") == 0) {
            return 1;
        }
        /* work now starts with the (already cut) first tag of the list */
        if (strcmp(word,"ni") != 0 || strcmp(work,"subst") != 0) {
            for (k=0;wt_markers[k];k++) {
                if (strcmp(tag,wt_markers[k]) == 0) return 1;
            }
        }
        tag=rest;
    }
    return 0;
}



/*
Should nouns written with a capital letter (other than acronyms) get an additional WM_super marker?
*/


/*
 * String pool: string_mem is one growing buffer of NUL-terminated strings,
 * string_offset is the number of bytes used and string_size the capacity.
 */
char *string_mem;
int string_offset;
int string_size;

/* Binary search tree over the pooled strings, used by get_string(). */
struct work_string {
    struct work_string *l,*r; /* smaller / greater strings (strcmp order) */
    int offset;               /* offset of this string inside string_mem */
} *__ws;

/*
 * Intern the string c in the string pool.
 *
 * Returns the offset of c inside string_mem; the string is appended to the
 * pool (growing it when needed) if it was not there yet. Because the pool
 * may be reallocated, pointers into string_mem must not be kept across a
 * call. Running out of memory terminates the program.
 */
int get_string(char *c)
{
    struct work_string **ws;
    ws=&__ws;
    while (*ws) {
        int n=strcmp(c,string_mem+(*ws)->offset);
        if (!n) return (*ws)->offset;
        if (n<0) ws = &(*ws)->l;
        else ws= &(*ws)->r;
    }
    if (!string_mem) {
        string_size=10000000;
        string_mem=malloc(string_size);
        if (!string_mem) {
            perror("get_string");
            exit(1);
        }
    }
    int l=strlen(c)+1;
    if (string_offset+l > string_size) {
        int newsize=string_size;
        char *grown;
        while (string_offset+l > newsize) {
            newsize=newsize + newsize/3;
        }
        /* keep the old pointer until the reallocation is known to work */
        grown=realloc(string_mem,newsize);
        if (!grown) {
            perror("get_string");
            exit(1);
        }
        string_mem=grown;
        string_size=newsize;
    }
    *ws=halloc(sizeof(struct work_string),0);
    (*ws)->l=(*ws)->r=NULL;
    (*ws)->offset=string_offset;
    strcpy(string_mem+string_offset,c);
    string_offset += l;
    return (*ws)->offset;
}


/*
 * In-memory word entry. Entries form a binary tree (l, r) keyed by the
 * lower-cased word; n chains further entries that share the same key.
 */
struct work_word {
    struct work_word *l,*r,*n;
    int word; // string pool offset of the lower-cased word
    int baseword; // string pool offset of the base form
    int pisownia; // string pool offset of the spelling as read from the input (same string as word when they do not differ)
    int real_offset; // index in the real_words table
    struct work_base *bw; // base entry, set by count_all_words()
    int baseword_no;
    u_int64_t grama; // word type and marker bits
    
} *main_words;
/* Statistics counters printed by main. */
int nstru_main,nstru_word;

/* Singly linked list of the words that share one base form. */
struct work_base_chain {
    struct work_base_chain *next;
    struct work_word *word;
};
/* Base form entry; entries form a binary tree keyed by the lower-cased base. */
struct work_base {
    struct work_base *l,*r;
    struct work_base_chain *chain; /* words registered by __add_base() */
    int string;  /* string pool offset of the lower-cased base */
    int count;   /* number of entries in chain */
    int offset;  /* index in real_base, assigned by create_base_vector() */
} *work_base;

/* Number of distinct base entries created so far. */
int base_count;

/*
 * Register word under the base form base.
 *
 * The base is lower-cased and looked up in the work_base tree; a new entry
 * is created when it is missing. word is then prepended to the entry's
 * chain and the entry's count is incremented. Returns the base entry.
 * Exits the program when base cannot be lower-cased by tlw().
 */
struct work_base *__add_base(char *base,struct work_word *word)
{
    struct work_base **wb=&work_base;
    char buf[8192];
    /* tlw() reports failure with a negative value, never with 0 */
    if (tlw(base,buf) < 0) {
        fprintf(stderr,"Bad tlw on %s\n",base);
        exit(1);
    }
    while (*wb) {
        char *cs=string_mem+(*wb)->string;
        int n=strcmp(buf,cs);
        if (!n) break;
        if (n < 0) wb = &(*wb)->l;
        else wb = & (*wb) ->r;
    }
    if (!*wb) {
        base_count++;
        *wb=halloc(sizeof(struct work_base),0);
        (*wb)->l = (*wb)->r=NULL;
        (*wb)->chain=NULL;
        (*wb)->string=get_string(buf);
        (*wb)->count=0;
    }
    struct work_base_chain *ws=halloc(sizeof(struct work_base_chain),0);
    ws->next=(*wb)->chain;
    (*wb)->chain=ws;
    (*wb)->count += 1;
    ws->word=word;
    return *wb;
}

int is_vulgar(char *word);

/*
 * str3 is set for verbs in these cases:
 *   forms ending in -śmy, -ście (but not with 'by'), i.e.
 *      pl & (pri | sec) & praet & ~pot
 *      sg & ~praet & pot
 *      pl & tri & ~praet & pot
 * str4 is set for pl forms with byśmy/byście.
 */

/*
 * Add the WM_str3 / WM_str4 marker to a verb's grama value.
 *
 * Non-verbs are returned unchanged. Imperatives get WM_str3 when the word
 * ends with the bytes of "że"; other verbs are marked from their number,
 * person, praet and pot bits as implemented below.
 */
static u_int64_t __regram(u_int64_t grama,char *word)
{
    if (WT_GET(grama) != WT_verb) return grama;
    if (grama & WM_impt) {
        /* "że" is 3 bytes here; shorter words cannot end with it */
        size_t len=strlen(word);
        if (len >= 3 && !strcmp(word+len-3,"że")) {
            return grama | WM_str3;
        }
        return grama;
    }
    else if (grama & WM_pl) {
        if (grama & (WM_pri | WM_sec)) {
            if (grama & WM_pot) return grama | WM_str4;
            if (grama & WM_praet) return grama | WM_str3;
        }
        else if (grama & WM_tri) {
            if ((grama & (WM_praet | WM_pot)) == WM_pot) return grama | WM_str3;
        }
        return grama;
    }
    else if (grama & WM_sg) {
        if ((grama & (WM_praet | WM_pot)) == WM_pot) return grama | WM_str3;
    }
    return grama;
}


/*
 * Insert one (word, base, grama) reading into the main_words tree.
 *
 * grama gets WM_vulgar when base matches a vulgar pattern and is passed
 * through __regram(). The tree is keyed by the lower-cased word; readings
 * with the same key are chained through n. A reading identical in word,
 * base, spelling and grama to an existing one is not inserted again.
 */
void __insert_gword(char *word,char *base,u_int64_t grama)
{
    char lower[256];
    struct work_word **slot=&main_words;
    struct work_word *fresh;

    if (is_vulgar(base)) {
        grama |= WM_vulgar;
    }
    tlw(word,lower);
    grama=__regram(grama,word);

    /* find the tree node for this key, or the empty link where it belongs */
    while (*slot) {
        int cmp=strcmp(lower,string_mem+(*slot)->word);
        if (cmp == 0) break;
        slot = (cmp < 0) ? &(*slot)->l : &(*slot)->r;
    }

    int word_off=get_string(lower);
    int base_off=get_string(base);
    int spell_off=get_string(word);

    if (*slot) nstru_main++;
    /* walk the chain of readings; stop at its end or at an exact duplicate */
    while (*slot) {
        struct work_word *cur=*slot;
        if (cur->word == word_off && cur->baseword == base_off &&
            cur->pisownia == spell_off && cur->grama == grama) {
            return;
        }
        slot=&cur->n;
    }

    nstru_word += 1;
    fresh=halloc(sizeof(struct work_word),0);
    *slot=fresh;
    memset(fresh,0,sizeof(struct work_word));
    fresh->word=word_off;
    fresh->baseword=base_off;
    fresh->pisownia=spell_off;
    fresh->grama=grama;
}

/*
 * Look up word (compared in lower case) in the main_words tree.
 * Returns the tree node, i.e. the head of the chain of its readings,
 * or NULL when the word is not present.
 */
struct work_word *find_word(char *word)
{
    char lower[256];
    struct work_word *node;

    tlw(word,lower);
    for (node=main_words;node;) {
        int cmp=strcmp(lower,string_mem+node->word);
        if (cmp == 0) return node;
        node = (cmp < 0) ? node->l : node->r;
    }
    return NULL;
}

/*
 * Process one input line.
 *
 * A grama of exactly "refl" does not create anything: it sets WM_refl on
 * every already inserted verb reading of word whose base form equals base,
 * and reports on stdout when the word or a matching verb is missing.
 * Otherwise grama is split on '+' and '|' and every alternative that
 * parse_grama() accepts is inserted with __insert_gword().
 */
void insert_word(char *word,char *base,char *grama)
{
    if (strcmp(grama,"refl") == 0) {
        struct work_word *w=find_word(word);
        int marked=0;

        if (w == NULL) printf("No word [%s]\n",word);
        while (w) {
            if (WT_GET(w->grama) == WT_verb && strcmp(base,string_mem + w->baseword) == 0) {
                w->grama |= WM_refl;
                marked++;
            }
            w=w->n;
        }
        if (marked == 0) printf("No verb [%s]\n",word);
        return;
    }

    /* work on a private copy: splitting overwrites the separators */
    char gbuf[8192];
    char *tag,*next;

    if (strlen(grama) >= sizeof gbuf) {
        fprintf(stderr,"Grama too long %s\n",grama);
        exit(1);
    }
    strcpy(gbuf,grama);

    tag=gbuf;
    while (tag && *tag) {
        next=strpbrk(tag,"+|");
        if (next) *next++=0;
        u_int64_t gm=parse_grama(tag);
        if (gm != 0) __insert_gword(word,base,gm);
        tag=next;
    }
}

/*
 * Return 1 when g1 and g2 have the same word type and are equal in every
 * bit outside mask, 0 otherwise.
 */
int same_grama(u_int64_t g1,u_int64_t g2,u_int64_t mask)
{
    u_int64_t keep=~mask;

    return WT_GET(g1) == WT_GET(g2) && (g1 & keep) == (g2 & keep);
}
/* Number of readings merged away by simplify_grama(). */
int simply;
/*
 * Merge readings in the chain starting at word that differ only in the bits
 * selected by mask: for each reading, every later reading with the same
 * base, spelling and same_grama() result is OR-ed into it and unlinked
 * from the chain.
 */
void simplify_grama(struct work_word *word,u_int64_t mask)
{
    while (word) {
        struct work_word **link=&word->n;

        while (*link) {
            struct work_word *other=*link;
            int mergeable=other->baseword == word->baseword &&
                          other->pisownia == word->pisownia &&
                          same_grama(word->grama,other->grama,mask);

            if (!mergeable) {
                link=&other->n;
                continue;
            }
            word->grama |= other->grama;
            simply++;
            *link=other->n;
        }
        word=word->n;
    }
}

void simplify_grama_tree(struct work_word *word)
{
    while (word) {
        if (word->l) simplify_grama_tree(word->l);
        simplify_grama(word,WM_CASU_MASK);
        simplify_grama(word,WM_GENR_MASK);
        word=word->r;
    }
    
}

void count_all_words(struct work_word *word)
{
    struct work_word *w;
    while (word) {
        if (word->l) count_all_words(word->l);
        for (w=word;w;w=w->n) {
            w->bw=__add_base(string_mem+w->baseword,w);
            nstru_word++;
        }
        word=word->r;
    }
}
/* Word record as written to semi/words.dat. */
struct real_word {
    u_int32_t name;       /* string pool offset of the lower-cased word */
    u_int32_t writename;  /* string pool offset of the spelling */
    u_int32_t basename;   /* string pool offset of the base form */
    u_int32_t baseword;   /* index in real_base, set by create_base_vector() */
    u_int32_t next_word;  /* index of the next reading of the same word, or -1 */
    u_int32_t dummy;      /* not assigned anywhere in this file */
    u_int64_t grama;      /* word type and marker bits */
} *real_words;

/* Next free index at the front of real_words (first reading of each word). */
int real_words_offset;
/* One past the next free index at the back of real_words (further readings). */
int real_words_sec_offset;
/* Total number of entries in real_words. */
int real_words_count;

void init_real_word(struct work_word *word)
{
    int otype=0;
    int nrw;
    int *into=NULL;
    for (;word;word=word->n,otype=1) {
        if (!otype) nrw=real_words_offset++;
        else nrw = --real_words_sec_offset;
        word->real_offset=nrw;
        real_words[nrw].name=word->word;
        real_words[nrw].basename=word->baseword;
        real_words[nrw].writename=word->pisownia;
        real_words[nrw].grama=word->grama;
        if (into) *into=nrw;
        into=&real_words[nrw].next_word;
        real_words[nrw].next_word=-1;
    }
}

void init_real_words(struct work_word *word)
{
    while (word) {
        if (word->l) init_real_words(word->l);
        init_real_word(word);
        word=word->r;
    }
}


int base_vector_count;
int *base_vector;
int base_vector_pos;

void compute_base_vector(struct work_base *base)
{
    while (base) {
        if (base->l) compute_base_vector(base->l);
        base_vector_count += base->count+1;
        base=base->r;
    }
}

/* Base form record as written to semi/base.dat. */
struct real_base {
    int name;    /* string pool offset of the lower-cased base */
    int vector;  /* start of this base's group in base_vector */
} *real_base;
/* Next free index in real_base. */
int real_base_offset;

/*
 * Fill real_base and base_vector from the tree rooted at base, in order.
 *
 * Each base gets the next real_base slot; the real_words indexes of its
 * chained words are appended to base_vector followed by -1, and every such
 * word's real_words entry gets its baseword set to the base's slot.
 */
void create_base_vector(struct work_base *base)
{
    for (;base;base=base->r) {
        struct work_base_chain *wc;

        if (base->l) create_base_vector(base->l);

        real_base[real_base_offset].name=base->string;
        real_base[real_base_offset].vector=base_vector_pos;
        base->offset=real_base_offset++;
        for (wc=base->chain;wc;wc=wc->next) {
            base_vector[base_vector_pos++] = wc->word->real_offset;
            real_words[wc->word->real_offset].baseword=base->offset;
        }
        /* group terminator */
        base_vector[base_vector_pos++]=-1;
    }
}


/* Header written to semi/header.dat; filled in at the end of main. */
struct minihdr {
    u_int32_t string_size;       /* bytes used in the string pool */
    u_int32_t word_count;        /* entries in real_words */
    u_int32_t uniword_count;     /* entries taken from the front of real_words */
    u_int32_t base_count;        /* entries in real_base */
    u_int32_t basevector_count;  /* entries used in base_vector */
} minihdr;

/*
 * Write len bytes starting at mem to the file name, creating or truncating
 * it (mode 0644). Partial writes are continued and interrupted ones
 * retried. Any failure is reported and terminates the program.
 */
void save(void *mem,size_t len,char *name)
{
    const char *p=mem;
    size_t left=len;
    int fd=open(name,O_WRONLY | O_CREAT | O_TRUNC, 0644);
    if (fd<0) {
        perror(name);
        exit(1);
    }
    while (left > 0) {
        ssize_t n=write(fd,p,left);
        if (n < 0) {
            if (errno == EINTR) continue;
            perror(name);
            exit(1);
        }
        if (n == 0) {
            /* no progress and no errno: avoid spinning forever */
            fprintf(stderr,"%s: short write\n",name);
            exit(1);
        }
        p += n;
        left -= (size_t)n;
    }
    /* close() may report a deferred write error */
    if (close(fd) != 0) {
        perror(name);
        exit(1);
    }
}

/*
 * Linked list of entries read from removed.txt. word is the start of a
 * variable-length string: read_removed() allocates sizeof(struct) plus
 * strlen bytes, the declared [1] providing the room for the terminator.
 */
struct removed_word {
    struct removed_word *next;
    char word[1];
} *removed_words=NULL;

/*
 * Strip leading and trailing whitespace from str in place.
 *
 * Trailing whitespace is cut by writing a NUL into str. Returns a pointer
 * to the first non-whitespace character (inside str), or to the terminator
 * when the string contains only whitespace.
 */
char *trim(char *str)
{
    char *d,*c;
    /* isspace() needs an unsigned char value: the input holds UTF-8 bytes */
    while (*str && isspace((unsigned char)*str)) str++;
    /* d ends up just past the last non-whitespace character */
    for (c=d=str;*c;c++) if (!isspace((unsigned char)*c)) d=c+1;
    *d=0;
    return str;
}
/*
 * Load removed.txt into the removed_words list, one trimmed non-empty line
 * per entry; each new entry is put at the head of the list. A missing file
 * is silently ignored.
 */
void read_removed(void)
{
    char line[256];
    FILE *f=fopen("removed.txt","r");

    if (f == NULL) return;
    while (fgets(line,256,f) != NULL) {
        char *s=trim(line);
        struct removed_word *w;

        if (*s == 0) continue;
        w=malloc(sizeof(*w)+strlen(s));
        strcpy(w->word,s);
        w->next=removed_words;
        removed_words=w;
    }
    fclose(f);
}

/* Return 1 when buf equals an entry of the removed_words list, else 0. */
int is_removed(char *buf)
{
    struct removed_word *r=removed_words;

    while (r != NULL) {
        if (strcmp(buf,r->word) == 0) return 1;
        r=r->next;
    }
    return 0;
}

/*
 * Linked list of patterns read from vulgaria.txt, kept in file order.
 * pattern is the start of a variable-length string (see read_vulgaria()).
 */
struct vulgaria {
    struct vulgaria *next;
    char pattern[1];
} *vulgaria;

/*
 * Load vulgaria.txt into the vulgaria list, one trimmed non-empty line per
 * pattern, preserving the order of the file. Exits when the file cannot be
 * opened.
 */
void read_vulgaria()
{
    char line[256];
    struct vulgaria **tail=&vulgaria;
    FILE *f=fopen("vulgaria.txt","r");

    if (f == NULL) {
        perror("vulgaria.txt");
        exit(1);
    }
    while (fgets(line,256,f) != NULL) {
        char *s=trim(line);
        struct vulgaria *w;

        if (*s == 0) continue;
        w=malloc(sizeof(*w)+strlen(s));
        w->next=NULL;
        strcpy(w->pattern,s);
        /* append, so that patterns are tried in file order */
        *tail=w;
        tail=&w->next;
    }
    fclose(f);
}

/*
 * Match word against one vulgaria pattern.
 *
 * Pattern syntax as implemented here:
 *   "-rest"  if word matches rest the result is -1; otherwise matching
 *            continues with the '-' taken as a literal character
 *   "*rest"  rest is tried at every non-empty suffix of word
 *   a '*' met later in the pattern accepts whatever is left of word
 *   anything else must be equal to word character by character
 * Returns 1 on a match, 0 on no match, -1 for a matching "-" pattern.
 */
int pass_vulgar(char *word,char *pattern)
{
    if (pattern[0] == '-' && pass_vulgar(word,pattern+1) != 0) {
        return -1;
    }

    if (pattern[0] == '*') {
        char *tail=pattern+1;

        while (*word != 0) {
            if (pass_vulgar(word,tail) != 0) return 1;
            word++;
        }
        return 0;
    }

    while (*pattern != 0) {
        if (*pattern == '*') return 1;
        if (*pattern != *word) return 0;
        pattern++;
        word++;
    }
    /* pattern exhausted: a match only if the word is exhausted as well */
    return *word == 0;
}

int is_vulgar(char *word)
{
    struct vulgaria *w;
    int n;
    for (w=vulgaria;w;w=w->next) {
        if (n=pass_vulgar(word,w->pattern)) {
            //printf("Vulgar %d:%s [%s]\n",n,word,w->pattern);
            return (n != -1) ? 1 : 0;
        }
    }
    return 0;
}

/* Table of strings loaded by read_adjnie() and the number of entries in it. */
char **adjnie;
int nadjnie;

/*
 * Return 1 when s starts with "nie" and is found in adjnie[] by binary
 * search (strcmp order), otherwise 0.
 */
int is_adjnie(char *s)
{
    int first=0;
    int last=nadjnie-1;

    if (strncmp(s,"nie",3) != 0) return 0;

    while (first <= last) {
        int probe=(first+last)/2;
        int cmp=strcmp(s,adjnie[probe]);

        if (cmp == 0) return 1;
        if (cmp < 0) {
            last=probe-1;
        }
        else {
            first=probe+1;
        }
    }
    return 0;
}

/* qsort() comparator: orders two char* elements with strcmp(). */
static int adjnie_cmp(const void *a,const void *b)
{
    return strcmp(*(char * const *)a,*(char * const *)b);
}

/*
 * Load adjnie.txt into adjnie[] / nadjnie, one trimmed non-empty line per
 * entry. A missing file leaves the table empty. The table is sorted in
 * strcmp() order afterwards, because is_adjnie() binary-searches it with
 * strcmp(); an already sorted file is unaffected. Running out of memory
 * terminates the program.
 */
void read_adjnie(void)
{
    char buf[256],*s;
    FILE *f;
    int ladjnie=10000;
    nadjnie=0;
    adjnie=malloc(ladjnie * sizeof(*adjnie));
    if (!adjnie) {
        perror("adjnie.txt");
        exit(1);
    }
    f=fopen("adjnie.txt","r");
    if (!f) return;
    while (fgets(buf,256,f)) {
        char *copy;
        s=trim(buf);
        if (!*s) continue;
        if (nadjnie >= ladjnie) {
            char **grown;
            ladjnie += 10000;
            grown=realloc(adjnie,ladjnie * sizeof(*adjnie));
            if (!grown) {
                perror("adjnie.txt");
                exit(1);
            }
            adjnie=grown;
        }
        copy=strdup(s);
        if (!copy) {
            perror("adjnie.txt");
            exit(1);
        }
        adjnie[nadjnie++]=copy;
    }
    fclose(f);
    qsort(adjnie,nadjnie,sizeof(*adjnie),adjnie_cmp);
}

/*
 * Build the negated version of an adjective grammar string.
 *
 * *grama is a '+'-separated list of tags that must all start with "adj:".
 * In every tag ":aff" is replaced by ":neg"; tags without ":aff" get ":neg"
 * appended. On success *grama is replaced by a newly malloc()ed string and
 * 1 is returned. When a tag is not an adjective or already contains ":neg",
 * a message is printed, *grama is left pointing to the original string and
 * 0 is returned.
 *
 * The original string is modified in place while it is scanned ('+' is
 * overwritten with NUL, "aff" with "neg"). word and base are used only in
 * the messages.
 */
int neg_grama(char **grama,char *word,char *base)
{
    char *c,*din,*s,*out,*p;
    size_t cap,len=0,n;

    din=*grama;
    /* worst case: every alternative grows by ":neg" (4 bytes), plus NUL */
    cap=1;
    if (din) {
        cap += strlen(din) + 4;
        for (p=din;*p;p++) {
            if (*p == '+') cap += 4;
        }
    }
    out=malloc(cap);
    if (!out) {
        perror("neg_grama");
        exit(1);
    }
    out[0]=0;
    while (din && *din) {
        if (strncmp("adj:",din,4)) {
            fprintf(stderr,"Bad grama [%s]-[%s] in [%s/%s]\n",din,*grama,word,base);
            free(out);
            return 0;
        }
        c=strchr(din,'+');
        if (c) *c++=0;
        s=strstr(din,":neg");
        if (s) {
            fprintf(stderr,"Negated grama [%s] in [%s]\n",*grama,word);
            free(out);
            return 0;
        }
        s=strstr(din,":aff");
        if (s) memcpy(s+1,"neg",3);
        n=strlen(din);
        memcpy(out+len,din,n);
        len += n;
        if (!s) {
            memcpy(out+len,":neg",4);
            len += 4;
        }
        if (c) out[len++]='+';
        out[len]=0;
        din=c;
    }
    *grama=out;
    return 1;
}


/*
 * Build the intermediate dictionary tables.
 *
 * Reads polimorfologik.txt and extras.txt from the current directory (plus
 * the optional/required word lists handled by read_removed(),
 * read_vulgaria() and read_adjnie()), inserts every accepted line into the
 * in-memory trees and writes header.dat, string.dat, words.dat, base.dat
 * and vector.dat into the directory "semi". Progress and statistics go to
 * stdout. Exits with status 1 on any input or allocation failure.
 *
 * Support for a third input file, impspot.txt, is currently disabled.
 */
int main(void)
{
    struct stat sb;
    struct stat sb2;
    int fd;
    char *c,*d;
    char *buf;
    char *base;
    char *grama;
    struct linein li;

    if (stat("polimorfologik.txt",&sb)) exit(1);
    if (stat("extras.txt",&sb2)) exit(1);
    read_removed();
    read_vulgaria();
    read_adjnie();

    /* one buffer for both files, plus the separating newline and the NUL */
    morf_size=sb.st_size+sb2.st_size+1;
    morf_body=malloc(morf_size+1);
    if (!morf_body) {
        perror("malloc");
        exit(1);
    }
    fd=open("polimorfologik.txt",O_RDONLY);
    if (fd<0) exit(1);
    if (read(fd,morf_body,sb.st_size) != sb.st_size) {
        perror("dupa");
        exit(1);
    }
    close(fd);
    morf_body[sb.st_size]=0;
    /* replace U+2019 (UTF-8: E2 80 99) with an ASCII apostrophe, in place */
    for (c=d=morf_body;*c;) {
        if (!strncmp(c,"\xe2\x80\x99",3)) {
            c+=3;
            *d++='\'';
        }
        else {
            *d++=*c++;
        }
    }
    /* from here on sb.st_size is the length of the first part of the buffer */
    sb.st_size=d-morf_body;
    morf_body[sb.st_size++]='\n';
    fd=open("extras.txt",O_RDONLY);
    if (fd<0) exit(1);
    if (read(fd,morf_body+sb.st_size,sb2.st_size) != sb2.st_size) {
        perror("dupa");
        exit(1);
    }
    close(fd);
    morf_size=sb.st_size+sb2.st_size;
    morf_body[morf_size]=0;

    for (c=morf_body,lineno=0;*c;c++) {
        if (*c == '\n') lineno++;
    }
    printf("Line count %d\n",lineno);
    /* +1: the last line may lack a trailing newline and is then not counted */
    lines=malloc(sizeof(*lines) * (lineno + 1));
    if (!lines) {
        perror("malloc");
        exit(1);
    }
    lineno=0;
    /* split the buffer into lines and each line into word, base and grama */
    for (c=morf_body;c && *c;) {
        buf=c;
        c=strpbrk(c,"\r\n");
        if (c) {
            while (*c == '\r' || *c == '\n') *c++=0;
        }
        base=strchr(buf,'\t');
        if (!base) continue;
        *base++=0;
        grama=strchr(base,'\t');
        if (!grama) continue;
        *grama++=0;
        /* these filters apply only to lines from polimorfologik.txt */
        if (buf < morf_body + sb.st_size) {
            if (is_removed(base)) {
                printf("Remv %s\n",base);
                continue;
            }
            if (!strncmp(grama,"adj",3) && is_adjnie(base)) {
                continue;
            }
            if ((!strncmp(grama,"ppas",4) || !strncmp(grama,"pact",4)) &&
                strstr(grama,":neg") && !strncmp(buf,"nie",3)) {
                    continue;
            }
            if (!strcmp(buf,"nie") || !strcmp(buf,"też") || !strcmp(buf,"także")) {
                printf("Fnd %s\n",buf);
                continue;
            }
        }
        if (good_grama(grama,buf)) {
            lines[lineno].word=buf;
            lines[lineno].base=base;
            lines[lineno++].grama=grama;
        }
    }

    int i,j;
    printf("Przeczytane %d\n",lineno);
    /* shuffle the lines before they are inserted into the search trees */
    for (i=0;i<lineno;i++) {
        j=rand() % lineno;
        li=lines[i];
        lines[i]=lines[j];
        lines[j]=li;
    }
    printf("Potasowane\n");
    /* a bare "refl" line is treated as a fatal input error */
    int lpos=lineno,pos;
    for (pos=0;pos<lpos;pos++) {
        if (strcmp(lines[pos].grama,"refl")) continue;
        printf("Reflux %s\n",lines[pos].word);
        exit(1);
    }
    printf ("LP = %d, ML=%d\n",lpos,lineno);
    for (pos=0;pos < lineno;pos++) {
        insert_word(lines[pos].word,lines[pos].base,lines[pos].grama);
    }
    printf("1. NMAIN = %d/%d, SM=%d, BC=%d\n",nstru_main,nstru_word,string_offset,base_count);

    simplify_grama_tree(main_words);
    nstru_word=0;
    /* base words are counted only here, after the readings were merged */
    count_all_words(main_words);
    printf("Simply %d/%d\n",simply,nstru_word);
    printf("2. NMAIN = %d/%d, SM=%d, BC=%d\n",nstru_main,nstru_word,string_offset,base_count);

    real_words_count=nstru_word;
    real_words_sec_offset=real_words_count;
    real_words_offset=0;
    /* zero-filled so that fields nobody assigns are written out as 0 */
    real_words=calloc(real_words_count ? real_words_count : 1,sizeof(struct real_word));
    if (!real_words) {
        perror("calloc");
        exit(1);
    }
    init_real_words(main_words);
    printf("%d/%d\n",real_words_offset,real_words_sec_offset);
    compute_base_vector(work_base);
    real_base=malloc(sizeof(*real_base) * (base_count ? base_count : 1));
    base_vector=malloc(sizeof(*base_vector) * (base_vector_count ? base_vector_count : 1));
    if (!real_base || !base_vector) {
        perror("malloc");
        exit(1);
    }
    printf("Base vector count %d\n",base_vector_count);
    create_base_vector(work_base);
    printf("RBO %d BVP %d\n",real_base_offset,base_vector_pos);


    minihdr.string_size=string_offset;
    minihdr.word_count=real_words_count;
    minihdr.uniword_count=real_words_offset;
    minihdr.base_count=base_count;
    minihdr.basevector_count=base_vector_pos;
    /* the directory may already exist; save() reports real problems */
    mkdir("semi",0755);
    save(&minihdr,sizeof(minihdr),"semi/header.dat");
    save(string_mem,string_offset,"semi/string.dat");
    save(real_words,sizeof(struct real_word) * real_words_count,"semi/words.dat");
    save(real_base,sizeof(struct real_base) * base_count,"semi/base.dat");
    save(base_vector,sizeof(*base_vector) * base_vector_pos,"semi/vector.dat");
    return 0;
}
