概述
ip_fragment函数用于判断是否需要进行分片:如果IP头没有设置DF标记,则直接进入分片流程;如果设置了DF标记,则继续判断——若不允许本地分片(skb->ignore_df未置位),或者收到的最大分片长度(frag_max_size)大于MTU,则回复ICMP“需要分片”差错报文、释放skb并返回-EMSGSIZE;其余情况仍然进入分片流程。
ip_do_fragment是具体的分片流程,分为快速路径和慢速路径两种:如果skb带有分片列表frag_list并且通过了各项检查,则走快速路径,为每个分片拷贝IP头、设置偏移等信息后直接发送;如果不存在分片列表,或者分片列表检查失败,则走慢速路径,按照MTU大小对整个数据重新划分,为每个分片分配新的skb、拷贝数据、设置IP头等信息,然后发送出去。
源码分析
1 static int ip_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
2 unsigned int mtu,
3 int (*output)(struct net *, struct sock *, struct sk_buff *))
4 {
5 struct iphdr *iph = ip_hdr(skb);
6
7 /* 如果没有DF标记,则进行分片 */
8 if ((iph->frag_off & htons(IP_DF)) == 0)
9 return ip_do_fragment(net, sk, skb, output);
10
11 /* 有DF标记则继续判断 */
12
13 /* 不允许本地分片 || 分片最大长度>MTU */
14 if (unlikely(!skb->ignore_df ||
15 (IPCB(skb)->frag_max_size &&
16 IPCB(skb)->frag_max_size > mtu))) {
17 IP_INC_STATS(net, IPSTATS_MIB_FRAGFAILS);
18 /* ICMP错误 */
19 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
20 htonl(mtu));
21 /* 释放skb */
22 kfree_skb(skb);
23 return -EMSGSIZE;
24 }
25
26 /* 其他情况,继续分片 */
27 return ip_do_fragment(net, sk, skb, output);
28 }
1 int ip_do_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
2 int (*output)(struct net *, struct sock *, struct sk_buff *))
3 {
4 struct iphdr *iph;
5 int ptr;
6 struct sk_buff *skb2;
7 unsigned int mtu, hlen, left, len, ll_rs;
8 int offset;
9 __be16 not_last_frag;
10 struct rtable *rt = skb_rtable(skb);
11 int err = 0;
12
13 /* for offloaded checksums cleanup checksum before fragmentation */
14 /* PARTIAL类型需要清除校验和 */
15 if (skb->ip_summed == CHECKSUM_PARTIAL &&
16 (err = skb_checksum_help(skb)))
17 goto fail;
18
19 /*
20 * Point into the IP datagram header.
21 */
22
23 iph = ip_hdr(skb);
24
25 /* 获取mtu */
26 mtu = ip_skb_dst_mtu(sk, skb);
27
28 /* 接收到的最大分片长度 < mtu,则将mtu设置为该值 */
29 if (IPCB(skb)->frag_max_size && IPCB(skb)->frag_max_size < mtu)
30 mtu = IPCB(skb)->frag_max_size;
31
32 /*
33 * Setup starting values.
34 */
35
36 hlen = iph->ihl * 4;
37 mtu = mtu - hlen; /* Size of data space */
38 IPCB(skb)->flags |= IPSKB_FRAG_COMPLETE;
39
40 /* When frag_list is given, use it. First, check its validity:
41 * some transformers could create wrong frag_list or break existing
42 * one, it is not prohibited. In this case fall back to copying.
43 *
44 * LATER: this step can be merged to real generation of fragments,
45 * we can switch to copy when see the first bad fragment.
46 */
47 /* 有分片列表 */
48 if (skb_has_frag_list(skb)) {
49 struct sk_buff *frag, *frag2;
50
51 /* 线性区域和分页区的数据长度 */
52 unsigned int first_len = skb_pagelen(skb);
53
54 /* 以下情况,进入慢路处理 */
55 if (first_len - hlen > mtu || /* 分片长度>MTU */
56 ((first_len - hlen) & 7) || /* 没有8字节对齐 */
57 ip_is_fragment(iph) || /* 是一个分片 */
58 skb_cloned(skb)) /* 是克隆的 */
59 goto slow_path;
60
61 /* 遍历分片列表 */
62 skb_walk_frags(skb, frag) {
63 /* Correct geometry. */
64 /* 以下情况,恢复状态,进入慢速路径 */
65 if (frag->len > mtu || /* 分片长度>mtu */
66 ((frag->len & 7) && frag->next) || /* 除最后一个分片外,其余有非8字节对齐的 */
67 skb_headroom(frag) < hlen) /* 头部长度过小 */
68 goto slow_path_clean;
69
70 /* Partially cloned skb? */
71 /* 克隆的,恢复状态,进入慢速路径 */
72 if (skb_shared(frag))
73 goto slow_path_clean;
74
75 BUG_ON(frag->sk);
76
77 /* 分片关联控制块 */
78 if (skb->sk) {
79 frag->sk = skb->sk;
80 frag->destructor = sock_wfree;
81 }
82
83 /* 第一个skb的长度去掉当前分片的长度 */
84 skb->truesize -= frag->truesize;
85 }
86
87 /* Everything is OK. Generate! */
88
89 /* 现在分片没问题了,设置分片信息 */
90 err = 0;
91 offset = 0;
92 frag = skb_shinfo(skb)->frag_list;
93 skb_frag_list_init(skb);
94 skb->data_len = first_len - skb_headlen(skb);
95 skb->len = first_len;
96 iph->tot_len = htons(first_len);
97 iph->frag_off = htons(IP_MF);
98 ip_send_check(iph);
99
100 /* 循环设置分片信息,并发送 */
101 for (;;) {
102 /* Prepare header of the next frame,
103 * before previous one went down. */
104 /* 为每一片都拷贝ip头,设置偏移信息 */
105 if (frag) {
106 frag->ip_summed = CHECKSUM_NONE;
107 skb_reset_transport_header(frag);
108 __skb_push(frag, hlen);
109 skb_reset_network_header(frag);
110 memcpy(skb_network_header(frag), iph, hlen);
111 iph = ip_hdr(frag);
112 iph->tot_len = htons(frag->len);
113 ip_copy_metadata(frag, skb);
114 if (offset == 0)
115 ip_options_fragment(frag);
116 offset += skb->len - hlen;
117 iph->frag_off = htons(offset>>3);
118 if (frag->next)
119 iph->frag_off |= htons(IP_MF);
120 /* Ready, complete checksum */
121 ip_send_check(iph);
122 }
123
124 /* 调用发送回调 */
125 err = output(net, sk, skb);
126
127 if (!err)
128 IP_INC_STATS(net, IPSTATS_MIB_FRAGCREATES);
129 if (err || !frag)
130 break;
131
132 skb = frag;
133 frag = skb->next;
134 skb->next = NULL;
135 }
136
137 if (err == 0) {
138 IP_INC_STATS(net, IPSTATS_MIB_FRAGOKS);
139 return 0;
140 }
141
142 /* 出错,释放分片 */
143 while (frag) {
144 skb = frag->next;
145 kfree_skb(frag);
146 frag = skb;
147 }
148 IP_INC_STATS(net, IPSTATS_MIB_FRAGFAILS);
149 return err;
150
151 slow_path_clean:
152 /* 将分片恢复原状态 */
153 skb_walk_frags(skb, frag2) {
154 if (frag2 == frag)
155 break;
156 frag2->sk = NULL;
157 frag2->destructor = NULL;
158 skb->truesize += frag2->truesize;
159 }
160 }
161
162 slow_path:
163 /* 慢速分片路径 */
164
165
166 iph = ip_hdr(skb);
167
168 /* 除去首部的剩余空间 */
169 left = skb->len - hlen; /* Space per frame */
170 ptr = hlen; /* Where to start from */
171
172 /* 二层头部空间 */
173 ll_rs = LL_RESERVED_SPACE(rt->dst.dev);
174
175 /*
176 * Fragment the datagram.
177 */
178
179 /* 初始化mf和offset */
180 offset = (ntohs(iph->frag_off) & IP_OFFSET) << 3;
181 not_last_frag = iph->frag_off & htons(IP_MF);
182
183 /*
184 * Keep copying data until we run out.
185 */
186
187 /* 开始分片了 */
188 while (left > 0) {
189 /* len初始为剩余长度 */
190 len = left;
191 /* IF: it doesn't fit, use 'mtu' - the data space left */
192 /* 根据mtu确认长度 */
193 if (len > mtu)
194 len = mtu;
195 /* IF: we are not sending up to and including the packet end
196 then align the next start on an eight byte boundary */
197 /* 除最后分片外,其余8字节对齐 */
198 if (len < left) {
199 len &= ~7;
200 }
201
202 /* Allocate buffer */
203 /* 分配skb */
204 skb2 = alloc_skb(len + hlen + ll_rs, GFP_ATOMIC);
205 if (!skb2) {
206 err = -ENOMEM;
207 goto fail;
208 }
209
210 /*
211 * Set up data on packet
212 */
213
214 /* 拷贝元数据 */
215 ip_copy_metadata(skb2, skb);
216
217 /* 预留空间,设置头部偏移 */
218 skb_reserve(skb2, ll_rs);
219 skb_put(skb2, len + hlen);
220 skb_reset_network_header(skb2);
221 skb2->transport_header = skb2->network_header + hlen;
222
223 /*
224 * Charge the memory for the fragment to any owner
225 * it might possess
226 */
227 /* 关联sk */
228 if (skb->sk)
229 skb_set_owner_w(skb2, skb->sk);
230
231 /*
232 * Copy the packet header into the new buffer.
233 */
234
235 /* 拷贝头部 */
236 skb_copy_from_linear_data(skb, skb_network_header(skb2), hlen);
237
238 /*
239 * Copy a block of the IP datagram.
240 */
241 /* 拷贝数据 */
242 if (skb_copy_bits(skb, ptr, skb_transport_header(skb2), len))
243 BUG();
244 left -= len;
245
246 /*
247 * Fill in the new header fields.
248 */
249 iph = ip_hdr(skb2);
250
251 /* 设置偏移 *//
252 iph->frag_off = htons((offset >> 3));
253
254 /* 转发的数据包,带有FRAG_PMTU标记,则打上DF */
255 if (IPCB(skb)->flags & IPSKB_FRAG_PMTU)
256 iph->frag_off |= htons(IP_DF);
257
258 /* ANK: dirty, but effective trick. Upgrade options only if
259 * the segment to be fragmented was THE FIRST (otherwise,
260 * options are already fixed) and make it ONCE
261 * on the initial skb, so that all the following fragments
262 * will inherit fixed options.
263 */
264 /* 第一个分片包含ip选项 */
265 if (offset == 0)
266 ip_options_fragment(skb);
267
268 /*
269 * Added AC : If we are fragmenting a fragment that's not the
270 * last fragment then keep MF on each bit
271 */
272 /* 不是最后分片需要设定MF标记 */
273 if (left > 0 || not_last_frag)
274 iph->frag_off |= htons(IP_MF);
275
276 /* 指针和偏移更新 */
277 ptr += len;
278 offset += len;
279
280 /*
281 * Put this fragment into the sending queue.
282 */
283 /* 设置数据长度 */
284 iph->tot_len = htons(len + hlen);
285
286 /* 校验和 */
287 ip_send_check(iph);
288
289 /* 发送分片 */
290 err = output(net, sk, skb2);
291 if (err)
292 goto fail;
293
294 IP_INC_STATS(net, IPSTATS_MIB_FRAGCREATES);
295 }
296
297 /* 分片完成并发送,释放skb */
298 consume_skb(skb);
299 IP_INC_STATS(net, IPSTATS_MIB_FRAGOKS);
300 return err;
301
302 fail:
303
304 /* 出错,释放skb */
305 kfree_skb(skb);
306 IP_INC_STATS(net, IPSTATS_MIB_FRAGFAILS);
307 return err;
308 }