- A+
文件IO流程
用户进程read、write在高速缓冲块上读写数据,高速缓冲块和块设备交换数据。
- 什么时机将磁盘块数据读到缓冲块?
- 什么时机将缓冲块数据刷到磁盘块?
函数调用关系
- read/write(c库函数,通过int 80调用sys_read/sys_write)
- sys_read/sys_write
- block_read/block_write
- breada
- getblk
- sync_dev
- ll_rw_block
- getblk
- breada
- block_read/block_write
- sys_read/sys_write
sys_read与sys_write
代码文件:linux-0.11/fs/read_write.c
系统调用sys_read与sys_write是内核提供给用户程序调用的IO接口。若IO设备是块设备,底层分别调用block_read与block_write进行块设备的读写。
sys_read
int sys_read(unsigned int fd,char * buf,int count) { struct file * file; struct m_inode * inode; // 通过文件描述符,在file表中找到file结构地址 if (fd>=NR_OPEN || count<0 || !(file=current->filp[fd])) return -EINVAL; if (!count) return 0; verify_area(buf,count); inode = file->f_inode; // 通过file的f_inode访问inode节点 //判断是什么设备:管道、字符设备、块设备 //如果是块设备,调用block_read读块设备 if (inode->i_pipe) return (file->f_mode&1)?read_pipe(inode,buf,count):-EIO; if (S_ISCHR(inode->i_mode)) return rw_char(READ,inode->i_zone[0],buf,count,&file->f_pos); if (S_ISBLK(inode->i_mode)) return block_read(inode->i_zone[0],&file->f_pos,buf,count); if (S_ISDIR(inode->i_mode) || S_ISREG(inode->i_mode)) { if (count+file->f_pos > inode->i_size) count = inode->i_size - file->f_pos; if (count<=0) return 0; return file_read(inode,file,buf,count); } printk("(Read)inode->i_mode=%06onr",inode->i_mode); return -EINVAL; }
sys_write
int sys_write(unsigned int fd,char * buf,int count) { struct file * file; struct m_inode * inode; if (fd>=NR_OPEN || count <0 || !(file=current->filp[fd])) return -EINVAL; if (!count) return 0; //判断是什么设备:管道、字符设备、块设备 //如果是块设备,调用block_write读块设备 inode=file->f_inode; if (inode->i_pipe) return (file->f_mode&2)?write_pipe(inode,buf,count):-EIO; if (S_ISCHR(inode->i_mode)) return rw_char(WRITE,inode->i_zone[0],buf,count,&file->f_pos); if (S_ISBLK(inode->i_mode)) return block_write(inode->i_zone[0],&file->f_pos,buf,count); if (S_ISREG(inode->i_mode)) return file_write(inode,file,buf,count); printk("(Write)inode->i_mode=%06onr",inode->i_mode); return -EINVAL; }
block_read与block_write
block_read与block_write负责块设备的读写。它们在底层调用breada获取缓冲块(block_write在整块覆盖写入时改为直接调用getblk),然后在缓冲块上读写数据。
block_write
代码文件:linux-0.11/fs/block_dev.c
int block_write(int dev, long * pos, char * buf, int count) { int block = *pos >> BLOCK_SIZE_BITS;// pos所在文件数据块号 int offset = *pos & (BLOCK_SIZE-1); // pos在数据块中偏移值 int chars; int written = 0; struct buffer_head * bh; //指向当前写缓冲块 register char * p; // 向缓冲块中写数据,通过getblk获取缓冲块,获取缓冲块的同时会读取磁盘块数据到缓冲块 // 数据量较多时,通过bread一次性缓存3个磁盘块数据到缓冲块,减小磁盘IO次数 while (count>0) { chars = BLOCK_SIZE - offset; if (chars > count) chars=count; if (chars == BLOCK_SIZE) //获取高速缓冲块,并建立其与磁盘块的映射关系 bh = getblk(dev,block); else // 读取的数据超过一个磁盘块,调用breada读多个块 // breada底层调用getblk缓存3个连续磁盘块的数据 bh = breada(dev,block,block+1,block+2,-1); block++; if (!bh) return written?written:-EIO; p = offset + bh->b_data; offset = 0; *pos += chars; written += chars; count -= chars; while (chars-->0) *(p++) = get_fs_byte(buf++); //完成对缓冲块的数据写入后,设置缓冲块的修改位dirt,然后释放缓冲块(引用计数减一) bh->b_dirt = 1; brelse(bh); } return written; }
block_read
代码文件:linux-0.11/fs/block_dev.c
/*
 * block_read - copy count bytes from a block device to the caller's
 * buffer through the buffer cache.
 *
 * dev:   device number
 * pos:   in/out byte position on the device; advanced by the bytes read
 * buf:   destination buffer (written one byte at a time with put_fs_byte)
 * count: number of bytes to read
 *
 * Returns the number of bytes read, or -EIO if breada failed before
 * anything was read.
 */
int block_read(int dev, unsigned long * pos, char * buf, int count)
{
    int block = *pos >> BLOCK_SIZE_BITS;    /* block number containing *pos */
    int offset = *pos & (BLOCK_SIZE-1);     /* byte offset inside that block */
    int chars;
    int read = 0;
    struct buffer_head * bh;
    register char * p;

    while (count>0) {
        /* Bytes taken from the current block on this pass. */
        chars = BLOCK_SIZE-offset;
        if (chars > count)
            chars = count;
        /*
         * NOTE(review): breada is not shown here; presumably it returns
         * the buffer for `block` and starts read-ahead on the next two.
         */
        if (!(bh = breada(dev,block,block+1,block+2,-1)))
            return read?read:-EIO;
        block++;
        p = offset + bh->b_data;
        offset = 0;     /* every block after the first starts at offset 0 */
        *pos += chars;
        read += chars;
        count -= chars;
        while (chars-->0)
            put_fs_byte(*(p++),buf++);
        /* Done with this buffer: drop our reference to it. */
        brelse(bh);
    }
    return read;
}
bread
代码文件:linux-0.11/fs/buffer.c
- bread:块读取函数
- breada:块提前预读函数
- bread_page:页块读取函数,一个内存页通常为4k大小、磁盘块通常为1k大小
bread、breada、bread_page三者功能相似,用法不同。三者均会调用getblk获取缓冲块,并调用ll_rw_block将磁盘块数据读入缓冲块。
调用ll_rw_block读数据到缓冲块。
struct buffer_head * bread(int dev,int block) { struct buffer_head * bh; if (!(bh=getblk(dev,block))) panic("bread: getblk returned NULLn"); if (bh->b_uptodate) return bh; // 调用ll_rw_block读磁盘块数据到缓冲区 ll_rw_block(READ,bh); wait_on_buffer(bh); if (bh->b_uptodate) return bh; brelse(bh); return NULL; }
getblk
代码文件:linux-0.11/fs/buffer.c
bread系列函数通过getblk获取缓冲块,在必要的时候,会调用sync_dev函数将脏缓冲块数据写入磁盘。
getblk代码逻辑复杂,需要对资源可用性进行复杂的检查。资源不可用时,需要睡眠,被唤醒之后又要进行一些检查判断资源是否可用。复杂逻辑可以暂时不考虑,避免陷入代码细节。
仅考虑getblk获取空闲块之后的代码逻辑。getblk获取可用缓冲块后,若缓冲块dirt位为1,表示缓冲块有数据未同步到磁盘,getblk将调用sync_dev将数据同步到磁盘,然后占用该缓冲块。
struct buffer_head * getblk(int dev,int block) { struct buffer_head * tmp, * bh; repeat: // 搜索hash表,如果指定块已经在高速缓冲中,则返回对应缓冲区头指针,退出。 if ((bh = get_hash_table(dev,block))) return bh; // 扫描空闲数据块链表,寻找空闲缓冲区。 tmp = free_list; do { // 如果该缓冲区正被使用(引用计数不等于0) if (tmp->b_count) continue; // 找到可用缓冲块,且满足一些条件 if (!bh || BADNESS(tmp)<BADNESS(bh)) { bh = tmp; if (!BADNESS(tmp)) break; } /* and repeat until we find something good */ } while ((tmp = tmp->b_next_free) != free_list); // 没有可用缓冲块,则睡眠等待有空闲缓冲块可用。 // 当有空闲缓冲块可用时本进程会被的唤醒。 if (!bh) { sleep_on(&buffer_wait); //睡眠在缓冲区上 goto repeat; } //等待缓冲区解锁? wait_on_buffer(bh); if (bh->b_count) goto repeat; // 分配到的缓冲块dirt位为1(表示有数据未同步到磁盘) // 调用sync_dev将数据同步到磁盘,并睡眠在该缓冲块上 while (bh->b_dirt) { sync_dev(bh->b_dev); wait_on_buffer(bh); if (bh->b_count) goto repeat; } /* NOTE!! While we slept waiting for this block, somebody else might */ /* already have added "this" block to the cache. check it */ if (find_buffer(dev,block)) goto repeat; /* OK, FINALLY we know that this buffer is the only one of it's kind, */ /* and that it's unused (b_count=0), unlocked (b_lock=0), and clean */ // 对空闲缓冲块的处理 // 占用空闲缓冲块。置引用计数为1,复位修改标志和有效(更新)标志。 bh->b_count=1; bh->b_dirt=0; bh->b_uptodate=0; // 从原hash队列和空闲队列块链表中移出该缓冲区头。根据此新的设备号和块号重新插入空闲链表和hash队列 // 让该缓冲区用于指定设备和其上的指定块。 // 根据此新的设备号和块号重新哈希,并插入响应的hash队列 remove_from_queues(bh); bh->b_dev=dev; bh->b_blocknr=block; //加锁 insert_into_queues(bh); return bh; }
sync_dev
代码文件:linux-0.11/fs/buffer.c
调用ll_rw_block将缓冲块内数据写入磁盘。
int sync_dev(int dev) { int i; struct buffer_head * bh; bh = start_buffer; for (i=0 ; i<NR_BUFFERS ; i++,bh++) { if (bh->b_dev != dev) continue; wait_on_buffer(bh); if (bh->b_dev == dev && bh->b_dirt) // 调用ll_rw_block写缓冲区数据到磁盘块 ll_rw_block(WRITE,bh); } bh = start_buffer; for (i=0 ; i<NR_BUFFERS ; i++,bh++) { if (bh->b_dev != dev) continue; wait_on_buffer(bh); if (bh->b_dev == dev && bh->b_dirt) ll_rw_block(WRITE,bh); } return 0; }
ll_rw_block
代码文件:linux-0.11/kernel/blk_drv/ll_rw_blk.c
将缓冲块的数据写入磁盘块,或将磁盘块数据读入缓冲块,底层通过设备请求队列完成读写。
/*
 * ll_rw_block - queue a read or write of one buffer on its block device.
 *
 * rw: operation passed through to make_request (callers use READ/WRITE)
 * bh: buffer to transfer; bh->b_dev selects the device
 *
 * If the major number is out of range or the device has no request
 * function, a message is printed and nothing is queued.
 */
void ll_rw_block(int rw, struct buffer_head * bh)
{
    unsigned int major;

    if ((major=MAJOR(bh->b_dev)) >= NR_BLK_DEV ||
    !(blk_dev[major].request_fn)) {
        printk("Trying to read nonexistent block-device\n\r");
        return;
    }
    /* Add the request to the device's request queue. */
    make_request(major,rw,bh);
}
设备中断处理程序
代码文件:linux-0.11/kernel/blk_drv/hd.c
- 读完成中断处理程序
设备完成一个扇区的读操作后发出读中断,读中断处理程序read_intr执行。若当前读请求还有数据要读,则继续完成当前请求的数据读取:一次读请求可能包含若干连续扇区,而每次中断只传输一个扇区的数据。完成一次读请求的所有数据读取之后,将调用do_hd_request处理下一个请求。
/*
 * read_intr - handler run on a hard-disk read interrupt.
 *
 * On error (win_result() non-zero) the failure is recorded with
 * bad_rw_intr() and request processing restarts.  Otherwise data is
 * transferred from HD_DATA into the current request's buffer, the request
 * is advanced, and either the handler is re-armed for the next sector or
 * the request is completed and the next one is started.
 */
static void read_intr(void)
{
    if (win_result()) {
        bad_rw_intr();
        do_hd_request();
        return;
    }
    /*
     * NOTE(review): 256 is presumably a count of 16-bit words, i.e. one
     * 512-byte sector, matching the 512-byte buffer advance below.
     */
    port_read(HD_DATA,CURRENT->buffer,256);
    CURRENT->errors = 0;
    CURRENT->buffer += 512;
    CURRENT->sector++;
    /* More sectors left in this request: re-arm this handler and wait. */
    if (--CURRENT->nr_sectors) {
        do_hd = &read_intr;
        return;
    }
    /* Request finished: complete it and start the next one. */
    end_request(1);
    do_hd_request();
}
- 写完成中断处理程序
与读完成中断处理程序过程类似。
static void write_intr(void) { if (win_result()) { bad_rw_intr(); do_hd_request(); //处理下一个请求 return; } if (--CURRENT->nr_sectors) { CURRENT->sector++; CURRENT->buffer += 512; do_hd = &write_intr; port_write(HD_DATA,CURRENT->buffer,256); return; } end_request(1); do_hd_request(); }
- 处理读写队列请求
处理设备请求队列的读写请求。设备中断处理程序不断调用do_hd_request处理请求队列,直到请求队列为空。
/*
 * do_hd_request - start processing the current hard-disk request.
 *
 * Validates the request, converts its starting sector into
 * sector/head/cylinder values, handles any pending reset or recalibrate,
 * and then issues the WRITE or READ command.  Later progress is driven by
 * the handler passed to hd_out (write_intr / read_intr / recal_intr).
 *
 * NOTE(review): the `repeat` label targeted by the gotos is not visible
 * in this function; presumably it is supplied by the INIT_REQUEST macro.
 */
void do_hd_request(void)
{
    int i,r = 0;
    unsigned int block,dev;
    unsigned int sec,head,cyl;
    unsigned int nsect;

    INIT_REQUEST;
    dev = MINOR(CURRENT->dev);
    block = CURRENT->sector;
    /* Reject an out-of-range minor or a request too near the end of hd[dev]. */
    if (dev >= 5*NR_HD || block+2 > hd[dev].nr_sects) {
        end_request(0);
        goto repeat;
    }
    /* Offset by the start sector recorded for this minor. */
    block += hd[dev].start_sect;
    /* NOTE(review): presumably 5 minors per drive, so this gives the drive index. */
    dev /= 5;
    /* block / sectors: quotient back into block, remainder into sec. */
    __asm__("divl %4":"=a" (block),"=d" (sec):"0" (block),"1" (0),
        "r" (hd_info[dev].sect));
    /* block / heads: quotient is the cylinder, remainder is the head. */
    __asm__("divl %4":"=a" (cyl),"=d" (head):"0" (block),"1" (0),
        "r" (hd_info[dev].head));
    /* NOTE(review): presumably because sector numbers are 1-based. */
    sec++;
    nsect = CURRENT->nr_sectors;
    /* A pending reset: reset the drive and request a recalibrate next. */
    if (reset) {
        reset = 0;
        recalibrate = 1;
        reset_hd(CURRENT_DEV);
        return;
    }
    /* A pending recalibrate: issue WIN_RESTORE, continued in recal_intr. */
    if (recalibrate) {
        recalibrate = 0;
        hd_out(dev,hd_info[CURRENT_DEV].sect,0,0,0,
            WIN_RESTORE,&recal_intr);
        return;
    }
    if (CURRENT->cmd == WRITE) {
        hd_out(dev,nsect,sec,head,cyl,WIN_WRITE,&write_intr);
        /* Poll the status port up to 3000 times for DRQ_STAT. */
        for(i=0 ; i<3000 && !(r=inb_p(HD_STATUS)&DRQ_STAT) ; i++)
            /* nothing */ ;
        /* DRQ_STAT never appeared: record the error and retry. */
        if (!r) {
            bad_rw_intr();
            goto repeat;
        }
        /* Send the first chunk; write_intr sends the remaining ones. */
        port_write(HD_DATA,CURRENT->buffer,256);
    } else if (CURRENT->cmd == READ) {
        /* Data is collected later, in read_intr. */
        hd_out(dev,nsect,sec,head,cyl,WIN_READ,&read_intr);
    } else
        panic("unknown hd-command");
}